// icu.h is autogenerated and merged from the ICU header files. // Code unused or not supported in the Windows ICU SDK has been removed. #if (NTDDI_VERSION >= NTDDI_WIN10_RS3) // Default Windows SDK ICU configuration options. // Alternate selections are not supported in the Windows SDK. #define U_DISABLE_RENAMING 1 #define U_SHOW_CPLUSPLUS_API 0 #define U_DEFAULT_SHOW_DRAFT 0 #define U_HIDE_DRAFT_API 1 #define U_HIDE_DEPRECATED_API 1 #define U_HIDE_OBSOLETE_API 1 #define U_HIDE_INTERNAL_API 1 #define U_NO_DEFAULT_INCLUDE_UTF_HEADERS 1 // appendable.h // No supported content // brkiter.h // No supported content // bytestream.h // No supported content // bytestrie.h // No supported content // bytestriebuilder.h // No supported content // caniter.h // No supported content // casemap.h // No supported content // char16ptr.h // No supported content // chariter.h // No supported content // dbbi.h // No supported content // docmain.h // No supported content // dtintrv.h // No supported content // edits.h // No supported content // enumset.h // No supported content // errorcode.h // No supported content // filteredbrk.h // No supported content // icuplug.h // No supported content // idna.h // No supported content // localebuilder.h // No supported content // localematcher.h // No supported content // localpointer.h // No supported content // locdspnm.h // No supported content // locid.h // No supported content // messagepattern.h // No supported content // normalizer2.h // No supported content // normlzr.h // No supported content // parsepos.h // No supported content // rbbi.h // No supported content // rep.h // No supported content // resbund.h // No supported content // schriter.h // No supported content // simpleformatter.h // No supported content // std_string.h // No supported content // strenum.h // No supported content // stringpiece.h // No supported content // stringtriebuilder.h // No supported content // symtable.h // No supported content // ucharstrie.h // No 
supported content // ucharstriebuilder.h // No supported content // uchriter.h // No supported content // uconfig.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (C) 2002-2016, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * file name: uconfig.h * encoding: UTF-8 * tab size: 8 (not used) * indentation:4 * * created on: 2002sep19 * created by: Markus W. Scherer */ #ifndef __UCONFIG_H__ #define __UCONFIG_H__ /*! * \file * \brief User-configurable settings * * Miscellaneous switches: * * A number of macros affect a variety of minor aspects of ICU. * Most of them used to be defined elsewhere (e.g., in utypes.h or platform.h) * and moved here to make them easier to find. * * Switches for excluding parts of ICU library code modules: * * Changing these macros allows building partial, smaller libraries for special purposes. * By default, all modules are built. * The switches are fairly coarse, controlling large modules. * Basic services cannot be turned off. * * Building with any of these options does not guarantee that the * ICU build process will completely work. It is recommended that * the ICU libraries and data be built using the normal build. * At that time you should remove the data used by those services. * After building the ICU data library, you should rebuild the ICU * libraries with these switches customized to your needs. * * @stable ICU 2.4 */ /** * \def U_DEBUG * Determines whether to include debugging code. * Automatically set on Windows, but most compilers do not have * related predefined macros. * @internal */ #ifdef U_DEBUG /* Use the predefined value. */ #elif defined(_DEBUG) /* * _DEBUG is defined by Visual Studio debug compilation. 
* Do *not* test for its NDEBUG macro: It is an orthogonal macro * which disables assert(). */ # define U_DEBUG 1 # else # define U_DEBUG 0 #endif /** * Determines whether to enable auto cleanup of libraries. * @internal */ #ifndef UCLN_NO_AUTO_CLEANUP #define UCLN_NO_AUTO_CLEANUP 1 #endif /** * \def U_DISABLE_RENAMING * Determines whether to disable renaming or not. * @internal */ #ifndef U_DISABLE_RENAMING #endif /** * \def U_NO_DEFAULT_INCLUDE_UTF_HEADERS * Determines whether utypes.h includes utf.h, utf8.h, utf16.h and utf_old.h. * utypes.h includes those headers if this macro is defined to 0. * Otherwise, each those headers must be included explicitly when using one of their macros. * Defaults to 0 for backward compatibility, except inside ICU. * @stable ICU 49 */ /** * \def U_OVERRIDE_CXX_ALLOCATION * Determines whether to override new and delete. * ICU is normally built such that all of its C++ classes, via their UMemory base, * override operators new and delete to use its internal, customizable, * non-exception-throwing memory allocation functions. (Default value 1 for this macro.) * * This is especially important when the application and its libraries use multiple heaps. * For example, on Windows, this allows the ICU DLL to be used by * applications that statically link the C Runtime library. * * @stable ICU 2.2 */ #ifndef U_OVERRIDE_CXX_ALLOCATION #define U_OVERRIDE_CXX_ALLOCATION 1 #endif /** * \def U_ENABLE_TRACING * Determines whether to enable tracing. * @internal */ #ifndef U_ENABLE_TRACING #define U_ENABLE_TRACING 0 #endif /** * \def UCONFIG_ENABLE_PLUGINS * Determines whether to enable ICU plugins. * @internal */ #ifndef UCONFIG_ENABLE_PLUGINS #define UCONFIG_ENABLE_PLUGINS 0 #endif /** * \def U_ENABLE_DYLOAD * Whether to enable Dynamic loading in ICU. * @internal */ #ifndef U_ENABLE_DYLOAD #define U_ENABLE_DYLOAD 1 #endif /** * \def U_CHECK_DYLOAD * Whether to test Dynamic loading as an OS capability. 
* @internal */ #ifndef U_CHECK_DYLOAD #define U_CHECK_DYLOAD 1 #endif /** * \def U_DEFAULT_SHOW_DRAFT * Do we allow ICU users to use the draft APIs by default? * @internal */ #ifndef U_DEFAULT_SHOW_DRAFT #define U_DEFAULT_SHOW_DRAFT 1 #endif /*===========================================================================*/ /* Custom icu entry point renaming */ /*===========================================================================*/ /** * \def U_HAVE_LIB_SUFFIX * 1 if a custom library suffix is set. * @internal */ #ifdef U_HAVE_LIB_SUFFIX /* Use the predefined value. */ #elif defined(U_LIB_SUFFIX_C_NAME) || defined(U_IN_DOXYGEN) # define U_HAVE_LIB_SUFFIX 1 #endif /** * \def U_LIB_SUFFIX_C_NAME_STRING * Defines the library suffix as a string with C syntax. * @internal */ #ifdef U_LIB_SUFFIX_C_NAME_STRING /* Use the predefined value. */ #elif defined(U_LIB_SUFFIX_C_NAME) # define CONVERT_TO_STRING(s) #s # define U_LIB_SUFFIX_C_NAME_STRING CONVERT_TO_STRING(U_LIB_SUFFIX_C_NAME) #else # define U_LIB_SUFFIX_C_NAME_STRING "" #endif /* common/i18n library switches --------------------------------------------- */ /** * \def UCONFIG_ONLY_COLLATION * This switch turns off modules that are not needed for collation. * * It does not turn off legacy conversion because that is necessary * for ICU to work on EBCDIC platforms (for the default converter). * If you want "only collation" and do not build for EBCDIC, * then you can define UCONFIG_NO_CONVERSION or UCONFIG_NO_LEGACY_CONVERSION to 1 as well. * * @stable ICU 2.4 */ #ifndef UCONFIG_ONLY_COLLATION # define UCONFIG_ONLY_COLLATION 0 #endif #if UCONFIG_ONLY_COLLATION /* common library */ # define UCONFIG_NO_BREAK_ITERATION 1 # define UCONFIG_NO_IDNA 1 /* i18n library */ # if UCONFIG_NO_COLLATION # error Contradictory collation switches in uconfig.h. 
# endif # define UCONFIG_NO_FORMATTING 1 # define UCONFIG_NO_TRANSLITERATION 1 # define UCONFIG_NO_REGULAR_EXPRESSIONS 1 #endif /* common library switches -------------------------------------------------- */ /** * \def UCONFIG_NO_FILE_IO * This switch turns off all file access in the common library * where file access is only used for data loading. * ICU data must then be provided in the form of a data DLL (or with an * equivalent way to link to the data residing in an executable, * as in building a combined library with both the common library's code and * the data), or via udata_setCommonData(). * Application data must be provided via udata_setAppData() or by using * "open" functions that take pointers to data, for example ucol_openBinary(). * * File access is not used at all in the i18n library. * * File access cannot be turned off for the icuio library or for the ICU * test suites and ICU tools. * * @stable ICU 3.6 */ #ifndef UCONFIG_NO_FILE_IO # define UCONFIG_NO_FILE_IO 0 #endif #if UCONFIG_NO_FILE_IO && defined(U_TIMEZONE_FILES_DIR) # error Contradictory file io switches in uconfig.h. #endif /** * \def UCONFIG_NO_CONVERSION * ICU will not completely build (compiling the tools fails) with this * switch turned on. * This switch turns off all converters. * * You may want to use this together with U_CHARSET_IS_UTF8 defined to 1 * in utypes.h if char* strings in your environment are always in UTF-8. * * @stable ICU 3.2 * @see U_CHARSET_IS_UTF8 */ #ifndef UCONFIG_NO_CONVERSION # define UCONFIG_NO_CONVERSION 0 #endif #if UCONFIG_NO_CONVERSION # define UCONFIG_NO_LEGACY_CONVERSION 1 #endif /** * \def UCONFIG_ONLY_HTML_CONVERSION * This switch turns off all of the converters NOT listed in * the HTML encoding standard: * http://www.w3.org/TR/encoding/#names-and-labels * * This is not possible on EBCDIC platforms * because they need ibm-37 or ibm-1047 default converters. 
* * @stable ICU 55 */ #ifndef UCONFIG_ONLY_HTML_CONVERSION # define UCONFIG_ONLY_HTML_CONVERSION 0 #endif /** * \def UCONFIG_NO_LEGACY_CONVERSION * This switch turns off all converters except for * - Unicode charsets (UTF-7/8/16/32, CESU-8, SCSU, BOCU-1) * - US-ASCII * - ISO-8859-1 * * Turning off legacy conversion is not possible on EBCDIC platforms * because they need ibm-37 or ibm-1047 default converters. * * @stable ICU 2.4 */ #ifndef UCONFIG_NO_LEGACY_CONVERSION # define UCONFIG_NO_LEGACY_CONVERSION 0 #endif /** * \def UCONFIG_NO_NORMALIZATION * This switch turns off normalization. * It implies turning off several other services as well, for example * collation and IDNA. * * @stable ICU 2.6 */ #ifndef UCONFIG_NO_NORMALIZATION # define UCONFIG_NO_NORMALIZATION 0 #endif #if UCONFIG_NO_NORMALIZATION /* common library */ /* ICU 50 CJK dictionary BreakIterator uses normalization */ # define UCONFIG_NO_BREAK_ITERATION 1 /* IDNA (UTS #46) is implemented via normalization */ # define UCONFIG_NO_IDNA 1 /* i18n library */ # if UCONFIG_ONLY_COLLATION # error Contradictory collation switches in uconfig.h. # endif # define UCONFIG_NO_COLLATION 1 # define UCONFIG_NO_TRANSLITERATION 1 #endif /** * \def UCONFIG_NO_BREAK_ITERATION * This switch turns off break iteration. * * @stable ICU 2.4 */ #ifndef UCONFIG_NO_BREAK_ITERATION # define UCONFIG_NO_BREAK_ITERATION 0 #endif /** * \def UCONFIG_NO_IDNA * This switch turns off IDNA. * * @stable ICU 2.6 */ #ifndef UCONFIG_NO_IDNA # define UCONFIG_NO_IDNA 0 #endif /** * \def UCONFIG_MSGPAT_DEFAULT_APOSTROPHE_MODE * Determines the default UMessagePatternApostropheMode. * See the documentation for that enum. 
* * @stable ICU 4.8 */ #ifndef UCONFIG_MSGPAT_DEFAULT_APOSTROPHE_MODE # define UCONFIG_MSGPAT_DEFAULT_APOSTROPHE_MODE UMSGPAT_APOS_DOUBLE_OPTIONAL #endif /* i18n library switches ---------------------------------------------------- */ /** * \def UCONFIG_NO_COLLATION * This switch turns off collation and collation-based string search. * * @stable ICU 2.4 */ #ifndef UCONFIG_NO_COLLATION # define UCONFIG_NO_COLLATION 0 #endif /** * \def UCONFIG_NO_FORMATTING * This switch turns off formatting and calendar/timezone services. * * @stable ICU 2.4 */ #ifndef UCONFIG_NO_FORMATTING # define UCONFIG_NO_FORMATTING 0 #endif /** * \def UCONFIG_NO_TRANSLITERATION * This switch turns off transliteration. * * @stable ICU 2.4 */ #ifndef UCONFIG_NO_TRANSLITERATION # define UCONFIG_NO_TRANSLITERATION 0 #endif /** * \def UCONFIG_NO_REGULAR_EXPRESSIONS * This switch turns off regular expressions. * * @stable ICU 2.4 */ #ifndef UCONFIG_NO_REGULAR_EXPRESSIONS # define UCONFIG_NO_REGULAR_EXPRESSIONS 0 #endif /** * \def UCONFIG_NO_SERVICE * This switch turns off service registration. * * @stable ICU 3.2 */ #ifndef UCONFIG_NO_SERVICE # define UCONFIG_NO_SERVICE 0 #endif /** * \def UCONFIG_HAVE_PARSEALLINPUT * This switch turns on the "parse all input" attribute. Binary incompatible. * * @internal */ #ifndef UCONFIG_HAVE_PARSEALLINPUT # define UCONFIG_HAVE_PARSEALLINPUT 1 #endif /** * \def UCONFIG_NO_FILTERED_BREAK_ITERATION * This switch turns off filtered break iteration code. 
* * @internal */ #ifndef UCONFIG_NO_FILTERED_BREAK_ITERATION # define UCONFIG_NO_FILTERED_BREAK_ITERATION 0 #endif #endif // __UCONFIG_H__ // udata.h // No supported content // unifilt.h // No supported content // unifunct.h // No supported content // unimatch.h // No supported content // uniset.h // No supported content // unistr.h // No supported content // uobject.h // No supported content // urename.h // No supported content // usetiter.h // No supported content // utf32.h // No supported content // uvernum.h // No supported content // platform.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ****************************************************************************** * * Copyright (C) 1997-2016, International Business Machines * Corporation and others. All Rights Reserved. * ****************************************************************************** * * FILE NAME : platform.h * * Date Name Description * 05/13/98 nos Creation (content moved here from ptypes.h). * 03/02/99 stephen Added AS400 support. * 03/30/99 stephen Added Linux support. * 04/13/99 stephen Reworked for autoconf. ****************************************************************************** */ #ifndef _PLATFORM_H #define _PLATFORM_H /** * \file * \brief Basic types for the platform. * * This file used to be generated by autoconf/configure. * Starting with ICU 49, platform.h is a normal source file, * to simplify cross-compiling and working with non-autoconf/make build systems. * * When a value in this file does not work on a platform, then please * try to derive it from the U_PLATFORM value * (for which we might need a new value constant in rare cases) * and/or from other macros that are predefined by the compiler * or defined in standard (POSIX or platform or compiler) headers. 
* * As a temporary workaround, you can add an explicit \#define for some macros * before it is first tested, or add an equivalent -D macro definition * to the compiler's command line. * * Note: Some compilers provide ways to show the predefined macros. * For example, with gcc you can compile an empty .c file and have the compiler * print the predefined macros with * \code * gcc -E -dM -x c /dev/null | sort * \endcode * (You can provide an actual empty .c file rather than /dev/null. * -x c++ is for C++.) */ /** * Define some things so that they can be documented. * @internal */ #ifdef U_IN_DOXYGEN /* * Problem: "platform.h:335: warning: documentation for unknown define U_HAVE_STD_STRING found." means that U_HAVE_STD_STRING is not documented. * Solution: #define any defines for non @internal API here, so that they are visible in the docs. If you just set PREDEFINED in Doxyfile.in, they won't be documented. */ /* None for now. */ #endif /** * \def U_PLATFORM * The U_PLATFORM macro defines the platform we're on. * * We used to define one different, value-less macro per platform. * That made it hard to know the set of relevant platforms and macros, * and hard to deal with variants of platforms. * * Starting with ICU 49, we define platforms as numeric macros, * with ranges of values for related platforms and their variants. * The U_PLATFORM macro is set to one of these values. * * Historical note from the Solaris Wikipedia article: * AT&T and Sun collaborated on a project to merge the most popular Unix variants * on the market at that time: BSD, System V, and Xenix. * This became Unix System V Release 4 (SVR4). * * @internal */ /** Unknown platform. @internal */ #define U_PF_UNKNOWN 0 /** Windows @internal */ #define U_PF_WINDOWS 1000 /** MinGW. Windows, calls to Win32 API, but using GNU gcc and binutils. @internal */ #define U_PF_MINGW 1800 /** * Cygwin. Windows, calls to cygwin1.dll for Posix functions, * using MSVC or GNU gcc and binutils. 
* @internal */ #define U_PF_CYGWIN 1900 /* Reserve 2000 for U_PF_UNIX? */ /** HP-UX is based on UNIX System V. @internal */ #define U_PF_HPUX 2100 /** Solaris is a Unix operating system based on SVR4. @internal */ #define U_PF_SOLARIS 2600 /** BSD is a UNIX operating system derivative. @internal */ #define U_PF_BSD 3000 /** AIX is based on UNIX System V Releases and 4.3 BSD. @internal */ #define U_PF_AIX 3100 /** IRIX is based on UNIX System V with BSD extensions. @internal */ #define U_PF_IRIX 3200 /** * Darwin is a POSIX-compliant operating system, composed of code developed by Apple, * as well as code derived from NeXTSTEP, BSD, and other projects, * built around the Mach kernel. * Darwin forms the core set of components upon which Mac OS X, Apple TV, and iOS are based. * (Original description modified from WikiPedia.) * @internal */ #define U_PF_DARWIN 3500 /** iPhone OS (iOS) is a derivative of Mac OS X. @internal */ #define U_PF_IPHONE 3550 /** QNX is a commercial Unix-like real-time operating system related to BSD. @internal */ #define U_PF_QNX 3700 /** Linux is a Unix-like operating system. @internal */ #define U_PF_LINUX 4000 /** * Native Client is pretty close to Linux. * See https://developer.chrome.com/native-client and * http://www.chromium.org/nativeclient * @internal */ #define U_PF_BROWSER_NATIVE_CLIENT 4020 /** Android is based on Linux. @internal */ #define U_PF_ANDROID 4050 /** Fuchsia is a POSIX-ish platform. @internal */ #define U_PF_FUCHSIA 4100 /* Maximum value for Linux-based platform is 4499 */ /** * Emscripten is a C++ transpiler for the Web that can target asm.js or * WebAssembly. It provides some POSIX-compatible wrappers and stubs and * some Linux-like functionality, but is not fully compatible with * either. * @internal */ #define U_PF_EMSCRIPTEN 5010 /** z/OS is the successor to OS/390 which was the successor to MVS. @internal */ #define U_PF_OS390 9000 /** "IBM i" is the current name of what used to be i5/OS and earlier OS/400. 
@internal */ #define U_PF_OS400 9400 #ifdef U_PLATFORM /* Use the predefined value. */ #elif defined(__MINGW32__) # define U_PLATFORM U_PF_MINGW #elif defined(__CYGWIN__) # define U_PLATFORM U_PF_CYGWIN #elif defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) # define U_PLATFORM U_PF_WINDOWS #elif defined(__ANDROID__) # define U_PLATFORM U_PF_ANDROID /* Android wchar_t support depends on the API level. */ # include #elif defined(__pnacl__) || defined(__native_client__) # define U_PLATFORM U_PF_BROWSER_NATIVE_CLIENT #elif defined(__Fuchsia__) # define U_PLATFORM U_PF_FUCHSIA #elif defined(linux) || defined(__linux__) || defined(__linux) # define U_PLATFORM U_PF_LINUX #elif defined(__APPLE__) && defined(__MACH__) # include # if (defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE) && (defined(TARGET_OS_MACCATALYST) && !TARGET_OS_MACCATALYST) /* variant of TARGET_OS_MAC */ # define U_PLATFORM U_PF_IPHONE # else # define U_PLATFORM U_PF_DARWIN # endif #elif defined(BSD) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__MirBSD__) # if defined(__FreeBSD__) # include # endif # define U_PLATFORM U_PF_BSD #elif defined(sun) || defined(__sun) /* Check defined(__SVR4) || defined(__svr4__) to distinguish Solaris from SunOS? */ # define U_PLATFORM U_PF_SOLARIS # if defined(__GNUC__) /* Solaris/GCC needs this header file to get the proper endianness. Normally, this * header file is included with stddef.h but on Solairs/GCC, the GCC version of stddef.h * is included which does not include this header file. 
*/ # include # endif #elif defined(_AIX) || defined(__TOS_AIX__) # define U_PLATFORM U_PF_AIX #elif defined(_hpux) || defined(hpux) || defined(__hpux) # define U_PLATFORM U_PF_HPUX #elif defined(sgi) || defined(__sgi) # define U_PLATFORM U_PF_IRIX #elif defined(__QNX__) || defined(__QNXNTO__) # define U_PLATFORM U_PF_QNX #elif defined(__TOS_MVS__) # define U_PLATFORM U_PF_OS390 #elif defined(__OS400__) || defined(__TOS_OS400__) # define U_PLATFORM U_PF_OS400 #elif defined(__EMSCRIPTEN__) # define U_PLATFORM U_PF_EMSCRIPTEN #else # define U_PLATFORM U_PF_UNKNOWN #endif /** * \def CYGWINMSVC * Defined if this is Windows with Cygwin, but using MSVC rather than gcc. * Otherwise undefined. * @internal */ /* Commented out because this is already set in mh-cygwin-msvc #if U_PLATFORM == U_PF_CYGWIN && defined(_MSC_VER) # define CYGWINMSVC #endif */ #ifdef U_IN_DOXYGEN # define CYGWINMSVC #endif /** * \def U_PLATFORM_USES_ONLY_WIN32_API * Defines whether the platform uses only the Win32 API. * Set to 1 for Windows/MSVC and MinGW but not Cygwin. * @internal */ #ifdef U_PLATFORM_USES_ONLY_WIN32_API /* Use the predefined value. */ #elif (U_PF_WINDOWS <= U_PLATFORM && U_PLATFORM <= U_PF_MINGW) || defined(CYGWINMSVC) # define U_PLATFORM_USES_ONLY_WIN32_API 1 #else /* Cygwin implements POSIX. */ # define U_PLATFORM_USES_ONLY_WIN32_API 0 #endif /** * \def U_PLATFORM_HAS_WIN32_API * Defines whether the Win32 API is available on the platform. * Set to 1 for Windows/MSVC, MinGW and Cygwin. * @internal */ #ifdef U_PLATFORM_HAS_WIN32_API /* Use the predefined value. */ #elif U_PF_WINDOWS <= U_PLATFORM && U_PLATFORM <= U_PF_CYGWIN # define U_PLATFORM_HAS_WIN32_API 1 #else # define U_PLATFORM_HAS_WIN32_API 0 #endif /** * \def U_PLATFORM_HAS_WINUWP_API * Defines whether target is intended for Universal Windows Platform API * Set to 1 for Windows10 Release Solution Configuration * @internal */ #ifdef U_PLATFORM_HAS_WINUWP_API /* Use the predefined value. 
*/ #else #endif /** * \def U_PLATFORM_IMPLEMENTS_POSIX * Defines whether the platform implements (most of) the POSIX API. * Set to 1 for Cygwin and most other platforms. * @internal */ #ifdef U_PLATFORM_IMPLEMENTS_POSIX /* Use the predefined value. */ #elif U_PLATFORM_USES_ONLY_WIN32_API # define U_PLATFORM_IMPLEMENTS_POSIX 0 #else # define U_PLATFORM_IMPLEMENTS_POSIX 1 #endif /** * \def U_PLATFORM_IS_LINUX_BASED * Defines whether the platform is Linux or one of its derivatives. * @internal */ #ifdef U_PLATFORM_IS_LINUX_BASED /* Use the predefined value. */ #elif U_PF_LINUX <= U_PLATFORM && U_PLATFORM <= 4499 # define U_PLATFORM_IS_LINUX_BASED 1 #else # define U_PLATFORM_IS_LINUX_BASED 0 #endif /** * \def U_PLATFORM_IS_DARWIN_BASED * Defines whether the platform is Darwin or one of its derivatives. * @internal */ #ifdef U_PLATFORM_IS_DARWIN_BASED /* Use the predefined value. */ #elif U_PF_DARWIN <= U_PLATFORM && U_PLATFORM <= U_PF_IPHONE # define U_PLATFORM_IS_DARWIN_BASED 1 #else # define U_PLATFORM_IS_DARWIN_BASED 0 #endif /** * \def U_HAVE_STDINT_H * Defines whether stdint.h is available. It is a C99 standard header. * We used to include inttypes.h which includes stdint.h but we usually do not need * the additional definitions from inttypes.h. * @internal */ #ifdef U_HAVE_STDINT_H /* Use the predefined value. */ #elif U_PLATFORM_USES_ONLY_WIN32_API # if defined(__BORLANDC__) || U_PLATFORM == U_PF_MINGW || (defined(_MSC_VER) && _MSC_VER>=1600) /* Windows Visual Studio 9 and below do not have stdint.h & inttypes.h, but VS 2010 adds them. */ # define U_HAVE_STDINT_H 1 # else # define U_HAVE_STDINT_H 0 # endif #elif U_PLATFORM == U_PF_SOLARIS /* Solaris has inttypes.h but not stdint.h. */ # define U_HAVE_STDINT_H 0 #elif U_PLATFORM == U_PF_AIX && !defined(_AIX51) && defined(_POWER) /* PPC AIX <= 4.3 has inttypes.h but not stdint.h. 
*/ # define U_HAVE_STDINT_H 0 #else # define U_HAVE_STDINT_H 1 #endif /** * \def U_HAVE_INTTYPES_H * Defines whether inttypes.h is available. It is a C99 standard header. * We include inttypes.h where it is available but stdint.h is not. * @internal */ #ifdef U_HAVE_INTTYPES_H /* Use the predefined value. */ #elif U_PLATFORM == U_PF_SOLARIS /* Solaris has inttypes.h but not stdint.h. */ # define U_HAVE_INTTYPES_H 1 #elif U_PLATFORM == U_PF_AIX && !defined(_AIX51) && defined(_POWER) /* PPC AIX <= 4.3 has inttypes.h but not stdint.h. */ # define U_HAVE_INTTYPES_H 1 #else /* Most platforms have both inttypes.h and stdint.h, or neither. */ # define U_HAVE_INTTYPES_H U_HAVE_STDINT_H #endif /*===========================================================================*/ /** @{ Compiler and environment features */ /*===========================================================================*/ /** * \def U_GCC_MAJOR_MINOR * Indicates whether the compiler is gcc (test for != 0), * and if so, contains its major (times 100) and minor version numbers. * If the compiler is not gcc, then U_GCC_MAJOR_MINOR == 0. * * For example, for testing for whether we have gcc, and whether it's 4.6 or higher, * use "#if U_GCC_MAJOR_MINOR >= 406". * @internal */ #ifdef __GNUC__ # define U_GCC_MAJOR_MINOR (__GNUC__ * 100 + __GNUC_MINOR__) #else # define U_GCC_MAJOR_MINOR 0 #endif /** * \def U_IS_BIG_ENDIAN * Determines the endianness of the platform. * @internal */ # define U_IS_BIG_ENDIAN 0 /** * \def U_HAVE_PLACEMENT_NEW * Determines whether to override placement new and delete for STL. * @stable ICU 2.6 */ #ifdef U_HAVE_PLACEMENT_NEW /* Use the predefined value. */ #elif defined(__BORLANDC__) # define U_HAVE_PLACEMENT_NEW 0 #else # define U_HAVE_PLACEMENT_NEW 1 #endif /** * \def U_HAVE_DEBUG_LOCATION_NEW * Define this to define the MFC debug version of the operator new. * * @stable ICU 3.4 */ #ifdef U_HAVE_DEBUG_LOCATION_NEW /* Use the predefined value. 
*/ #elif defined(_MSC_VER) # define U_HAVE_DEBUG_LOCATION_NEW 1 #else # define U_HAVE_DEBUG_LOCATION_NEW 0 #endif /* Compatibility with compilers other than clang: http://clang.llvm.org/docs/LanguageExtensions.html */ #ifdef __has_attribute # define UPRV_HAS_ATTRIBUTE(x) __has_attribute(x) #else # define UPRV_HAS_ATTRIBUTE(x) 0 #endif #ifdef __has_cpp_attribute # define UPRV_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x) #else # define UPRV_HAS_CPP_ATTRIBUTE(x) 0 #endif #ifdef __has_declspec_attribute # define UPRV_HAS_DECLSPEC_ATTRIBUTE(x) __has_declspec_attribute(x) #else # define UPRV_HAS_DECLSPEC_ATTRIBUTE(x) 0 #endif #ifdef __has_builtin # define UPRV_HAS_BUILTIN(x) __has_builtin(x) #else # define UPRV_HAS_BUILTIN(x) 0 #endif #ifdef __has_feature # define UPRV_HAS_FEATURE(x) __has_feature(x) #else # define UPRV_HAS_FEATURE(x) 0 #endif #ifdef __has_extension # define UPRV_HAS_EXTENSION(x) __has_extension(x) #else # define UPRV_HAS_EXTENSION(x) 0 #endif #ifdef __has_warning # define UPRV_HAS_WARNING(x) __has_warning(x) #else # define UPRV_HAS_WARNING(x) 0 #endif /** * \def U_MALLOC_ATTR * Attribute to mark functions as malloc-like * @internal */ #if defined(__GNUC__) && __GNUC__>=3 # define U_MALLOC_ATTR __attribute__ ((__malloc__)) #else # define U_MALLOC_ATTR #endif /** * \def U_ALLOC_SIZE_ATTR * Attribute to specify the size of the allocated buffer for malloc-like functions * @internal */ #if (defined(__GNUC__) && \ (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))) || \ UPRV_HAS_ATTRIBUTE(alloc_size) # define U_ALLOC_SIZE_ATTR(X) __attribute__ ((alloc_size(X))) # define U_ALLOC_SIZE_ATTR2(X,Y) __attribute__ ((alloc_size(X,Y))) #else # define U_ALLOC_SIZE_ATTR(X) # define U_ALLOC_SIZE_ATTR2(X,Y) #endif /** * \def U_CPLUSPLUS_VERSION * 0 if no C++; 1, 11, 14, ... if C++. * Support for specific features cannot always be determined by the C++ version alone. 
* @internal */ #ifdef U_CPLUSPLUS_VERSION # if U_CPLUSPLUS_VERSION != 0 && !defined(__cplusplus) # undef U_CPLUSPLUS_VERSION # define U_CPLUSPLUS_VERSION 0 # endif /* Otherwise use the predefined value. */ #elif !defined(__cplusplus) # define U_CPLUSPLUS_VERSION 0 #elif __cplusplus >= 201402L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201402L) # define U_CPLUSPLUS_VERSION 14 #elif __cplusplus >= 201103L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201103L) # define U_CPLUSPLUS_VERSION 11 #else // C++98 or C++03 # define U_CPLUSPLUS_VERSION 1 #endif #if (U_PLATFORM == U_PF_AIX || U_PLATFORM == U_PF_OS390) && defined(__cplusplus) &&(U_CPLUSPLUS_VERSION < 11) // add in std::nullptr_t namespace std { typedef decltype(nullptr) nullptr_t; }; #endif /** * \def U_NOEXCEPT * "noexcept" if supported, otherwise empty. * Some code, especially STL containers, uses move semantics of objects only * if the move constructor and the move operator are declared as not throwing exceptions. * @internal */ #ifdef U_NOEXCEPT /* Use the predefined value. */ #else # define U_NOEXCEPT noexcept #endif /** * \def U_FALLTHROUGH * Annotate intentional fall-through between switch labels. * http://clang.llvm.org/docs/AttributeReference.html#fallthrough-clang-fallthrough * @internal */ #ifndef __cplusplus // Not for C. #elif defined(U_FALLTHROUGH) // Use the predefined value. #elif defined(__clang__) // Test for compiler vs. feature separately. // Other compilers might choke on the feature test. 
# if UPRV_HAS_CPP_ATTRIBUTE(clang::fallthrough) || \ (UPRV_HAS_FEATURE(cxx_attributes) && \ UPRV_HAS_WARNING("-Wimplicit-fallthrough")) # define U_FALLTHROUGH [[clang::fallthrough]] # endif #elif defined(__GNUC__) && (__GNUC__ >= 7) # define U_FALLTHROUGH __attribute__((fallthrough)) #endif #ifndef U_FALLTHROUGH # define U_FALLTHROUGH #endif /** @} */ /*===========================================================================*/ /** @{ Character data types */ /*===========================================================================*/ /** * U_CHARSET_FAMILY is equal to this value when the platform is an ASCII based platform. * @stable ICU 2.0 */ #define U_ASCII_FAMILY 0 /** * U_CHARSET_FAMILY is equal to this value when the platform is an EBCDIC based platform. * @stable ICU 2.0 */ #define U_EBCDIC_FAMILY 1 /** * \def U_CHARSET_FAMILY * *

These definitions allow to specify the encoding of text * in the char data type as defined by the platform and the compiler. * It is enough to determine the code point values of "invariant characters", * which are the ones shared by all encodings that are in use * on a given platform.

* *

Those "invariant characters" should be all the uppercase and lowercase * latin letters, the digits, the space, and "basic punctuation". * Also, '\\n', '\\r', '\\t' should be available.

* *

The list of "invariant characters" is:
* \code * A-Z a-z 0-9 SPACE " % & ' ( ) * + , - . / : ; < = > ? _ * \endcode *
* (52 letters + 10 numbers + 20 punc/sym/space = 82 total)

* *

This matches the IBM Syntactic Character Set (CS 640).

* *

In other words, all the graphic characters in 7-bit ASCII should * be safely accessible except the following:

* * \code * '\' * '[' * ']' * '{' * '}' * '^' * '~' * '!' * '#' * '|' * '$' * '@' * '`' * \endcode * @stable ICU 2.0 */ #ifdef U_CHARSET_FAMILY /* Use the predefined value. */ #elif U_PLATFORM == U_PF_OS390 && (!defined(__CHARSET_LIB) || !__CHARSET_LIB) # define U_CHARSET_FAMILY U_EBCDIC_FAMILY #elif U_PLATFORM == U_PF_OS400 && !defined(__UTF32__) # define U_CHARSET_FAMILY U_EBCDIC_FAMILY #else # define U_CHARSET_FAMILY U_ASCII_FAMILY #endif /** * \def U_CHARSET_IS_UTF8 * * Hardcode the default charset to UTF-8. * * If this is set to 1, then * - ICU will assume that all non-invariant char*, StringPiece, std::string etc. * contain UTF-8 text, regardless of what the system API uses * - some ICU code will use fast functions like u_strFromUTF8() * rather than the more general and more heavy-weight conversion API (ucnv.h) * - ucnv_getDefaultName() always returns "UTF-8" * - ucnv_setDefaultName() is disabled and will not change the default charset * - static builds of ICU are smaller * - more functionality is available with the UCONFIG_NO_CONVERSION build-time * configuration option (see unicode/uconfig.h) * - the UCONFIG_NO_CONVERSION build option in uconfig.h is more usable * * @stable ICU 4.2 * @see UCONFIG_NO_CONVERSION */ #ifdef U_CHARSET_IS_UTF8 /* Use the predefined value. */ #elif U_PLATFORM_IS_LINUX_BASED || U_PLATFORM_IS_DARWIN_BASED || \ U_PLATFORM == U_PF_EMSCRIPTEN # define U_CHARSET_IS_UTF8 1 #else # define U_CHARSET_IS_UTF8 0 #endif /** @} */ /*===========================================================================*/ /** @{ Information about wchar support */ /*===========================================================================*/ /** * \def U_HAVE_WCHAR_H * Indicates whether is available (1) or not (0). Set to 1 by default. * * @stable ICU 2.0 */ #ifdef U_HAVE_WCHAR_H /* Use the predefined value. */ #elif U_PLATFORM == U_PF_ANDROID && __ANDROID_API__ < 9 /* * Android before Gingerbread (Android 2.3, API level 9) did not support wchar_t. 
* The type and header existed, but the library functions did not work as expected. * The size of wchar_t was 1 but L"xyz" string literals had 32-bit units anyway. */ # define U_HAVE_WCHAR_H 0 #else # define U_HAVE_WCHAR_H 1 #endif /** * \def U_SIZEOF_WCHAR_T * U_SIZEOF_WCHAR_T==sizeof(wchar_t) * * @stable ICU 2.0 */ #ifdef U_SIZEOF_WCHAR_T /* Use the predefined value. */ #elif (U_PLATFORM == U_PF_ANDROID && __ANDROID_API__ < 9) /* * Classic Mac OS and Mac OS X before 10.3 (Panther) did not support wchar_t or wstring. * Newer Mac OS X has size 4. */ # define U_SIZEOF_WCHAR_T 1 #elif U_PLATFORM_HAS_WIN32_API || U_PLATFORM == U_PF_CYGWIN # define U_SIZEOF_WCHAR_T 2 #elif U_PLATFORM == U_PF_AIX /* * AIX 6.1 information, section "Wide character data representation": * "... the wchar_t datatype is 32-bit in the 64-bit environment and * 16-bit in the 32-bit environment." * and * "All locales use Unicode for their wide character code values (process code), * except the IBM-eucTW codeset." */ # ifdef __64BIT__ # define U_SIZEOF_WCHAR_T 4 # else # define U_SIZEOF_WCHAR_T 2 # endif #elif U_PLATFORM == U_PF_OS390 /* * z/OS V1R11 information center, section "LP64 | ILP32": * "In 31-bit mode, the size of long and pointers is 4 bytes and the size of wchar_t is 2 bytes. * Under LP64, the size of long and pointer is 8 bytes and the size of wchar_t is 4 bytes." */ # ifdef _LP64 # define U_SIZEOF_WCHAR_T 4 # else # define U_SIZEOF_WCHAR_T 2 # endif #elif U_PLATFORM == U_PF_OS400 # if defined(__UTF32__) /* * LOCALETYPE(*LOCALEUTF) is specified. * Wide-character strings are in UTF-32, * narrow-character strings are in UTF-8. */ # define U_SIZEOF_WCHAR_T 4 # elif defined(__UCS2__) /* * LOCALETYPE(*LOCALEUCS2) is specified. * Wide-character strings are in UCS-2, * narrow-character strings are in EBCDIC. */ # define U_SIZEOF_WCHAR_T 2 # else /* * LOCALETYPE(*CLD) or LOCALETYPE(*LOCALE) is specified. * Wide-character strings are in 16-bit EBCDIC, * narrow-character strings are in EBCDIC. 
*/ # define U_SIZEOF_WCHAR_T 2 # endif #else # define U_SIZEOF_WCHAR_T 4 #endif #ifndef U_HAVE_WCSCPY #define U_HAVE_WCSCPY U_HAVE_WCHAR_H #endif /** @} */ /** * \def U_HAVE_CHAR16_T * Defines whether the char16_t type is available for UTF-16 * and u"abc" UTF-16 string literals are supported. * This is a new standard type and standard string literal syntax in C++0x * but has been available in some compilers before. * @internal */ #ifdef U_HAVE_CHAR16_T /* Use the predefined value. */ #else /* * Notes: * Visual Studio 2010 (_MSC_VER==1600) defines char16_t as a typedef * and does not support u"abc" string literals. * Visual Studio 2015 (_MSC_VER>=1900) and above adds support for * both char16_t and u"abc" string literals. * gcc 4.4 defines the __CHAR16_TYPE__ macro to a usable type but * does not support u"abc" string literals. * C++11 and C11 require support for UTF-16 literals * TODO: Fix for plain C. Doesn't work on Mac. */ # if U_CPLUSPLUS_VERSION >= 11 || (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) # define U_HAVE_CHAR16_T 1 # else # define U_HAVE_CHAR16_T 0 # endif #endif /** * @{ * \def U_DECLARE_UTF16 * Do not use this macro because it is not defined on all platforms. * Use the UNICODE_STRING or U_STRING_DECL macros instead. * @internal */ #ifdef U_DECLARE_UTF16 /* Use the predefined value. */ #elif U_HAVE_CHAR16_T \ || (defined(__xlC__) && defined(__IBM_UTF_LITERAL) && U_SIZEOF_WCHAR_T != 2) \ || (defined(__HP_aCC) && __HP_aCC >= 035000) \ || (defined(__HP_cc) && __HP_cc >= 111106) \ || (defined(U_IN_DOXYGEN)) # define U_DECLARE_UTF16(string) u ## string #elif U_SIZEOF_WCHAR_T == 2 \ && (U_CHARSET_FAMILY == 0 || (U_PF_OS390 <= U_PLATFORM && U_PLATFORM <= U_PF_OS400 && defined(__UCS2__))) # define U_DECLARE_UTF16(string) L ## string #else /* Leave U_DECLARE_UTF16 undefined. See unistr.h. 
*/ #endif /** @} */ /*===========================================================================*/ /** @{ Symbol import-export control */ /*===========================================================================*/ #ifdef U_EXPORT /* Use the predefined value. */ #elif defined(U_STATIC_IMPLEMENTATION) # define U_EXPORT #elif defined(_MSC_VER) || (UPRV_HAS_DECLSPEC_ATTRIBUTE(__dllexport__) && \ UPRV_HAS_DECLSPEC_ATTRIBUTE(__dllimport__)) # define U_EXPORT __declspec(dllexport) #elif defined(__GNUC__) # define U_EXPORT __attribute__((visibility("default"))) #elif (defined(__SUNPRO_CC) && __SUNPRO_CC >= 0x550) \ || (defined(__SUNPRO_C) && __SUNPRO_C >= 0x550) # define U_EXPORT __global /*#elif defined(__HP_aCC) || defined(__HP_cc) # define U_EXPORT __declspec(dllexport)*/ #else # define U_EXPORT #endif /* U_CALLCONV is related to U_EXPORT2 */ #ifdef U_EXPORT2 /* Use the predefined value. */ #elif defined(_MSC_VER) # define U_EXPORT2 __cdecl #else # define U_EXPORT2 #endif #ifdef U_IMPORT /* Use the predefined value. */ #elif defined(_MSC_VER) || (UPRV_HAS_DECLSPEC_ATTRIBUTE(__dllexport__) && \ UPRV_HAS_DECLSPEC_ATTRIBUTE(__dllimport__)) /* Windows needs to export/import data. */ # define U_IMPORT __declspec(dllimport) #else # define U_IMPORT #endif /** * \def U_HIDDEN * This is used to mark internal structs declared within external classes, * to prevent the internal structs from having the same visibility as the * class within which they are declared. * @internal */ #ifdef U_HIDDEN /* Use the predefined value. */ #elif defined(__GNUC__) # define U_HIDDEN __attribute__((visibility("hidden"))) #else # define U_HIDDEN #endif /** * \def U_CALLCONV * Similar to U_CDECL_BEGIN/U_CDECL_END, this qualifier is necessary * in callback function typedefs to make sure that the calling convention * is compatible. * * This is only used for non-ICU-API functions. * When a function is a public ICU API, * you must use the U_CAPI and U_EXPORT2 qualifiers. 
* * Please note, you need to use U_CALLCONV after the *. * * NO : "static const char U_CALLCONV *func( . . . )" * YES: "static const char* U_CALLCONV func( . . . )" * * @stable ICU 2.0 */ #if U_PLATFORM == U_PF_OS390 && defined(__cplusplus) # define U_CALLCONV __cdecl #else # define U_CALLCONV U_EXPORT2 #endif /** * \def U_CALLCONV_FPTR * Similar to U_CALLCONV, but only used on function pointers. * @internal */ #if U_PLATFORM == U_PF_OS390 && defined(__cplusplus) # define U_CALLCONV_FPTR U_CALLCONV #else # define U_CALLCONV_FPTR #endif /* @} */ #endif // _PLATFORM_H // ptypes.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ****************************************************************************** * * Copyright (C) 1997-2012, International Business Machines * Corporation and others. All Rights Reserved. * ****************************************************************************** * * FILE NAME : ptypes.h * * Date Name Description * 05/13/98 nos Creation (content moved here from ptypes.h). * 03/02/99 stephen Added AS400 support. * 03/30/99 stephen Added Linux support. * 04/13/99 stephen Reworked for autoconf. * 09/18/08 srl Moved basic types back to ptypes.h from platform.h ****************************************************************************** */ /** * \file * \brief C API: Definitions of integer types of various widths */ #ifndef _PTYPES_H #define _PTYPES_H /** * \def __STDC_LIMIT_MACROS * According to the Linux stdint.h, the ISO C99 standard specifies that in C++ implementations * macros like INT32_MIN and UINTPTR_MAX should only be defined if explicitly requested. * We need to define __STDC_LIMIT_MACROS before including stdint.h in C++ code * that uses such limit macros. 
* @internal */ #ifndef __STDC_LIMIT_MACROS #define __STDC_LIMIT_MACROS #endif /* NULL, size_t, wchar_t */ #include /* * If all compilers provided all of the C99 headers and types, * we would just unconditionally #include here * and not need any of the stuff after including platform.h. */ /* Find out if we have stdint.h etc. */ /*===========================================================================*/ /* Generic data types */ /*===========================================================================*/ /* If your platform does not have the header, you may need to edit the typedefs in the #else section below. Use #if...#else...#endif with predefined compiler macros if possible. */ #if U_HAVE_STDINT_H /* * We mostly need (which defines the standard integer types) but not . * includes and adds the printf/scanf helpers PRId32, SCNx16 etc. * which we almost never use, plus stuff like imaxabs() which we never use. */ #include #if U_PLATFORM == U_PF_OS390 /* The features header is needed to get (u)int64_t sometimes. */ #include /* z/OS has , but some versions are missing uint8_t (APAR PK62248). */ #if !defined(__uint8_t) #define __uint8_t 1 typedef unsigned char uint8_t; #endif #endif /* U_PLATFORM == U_PF_OS390 */ #elif U_HAVE_INTTYPES_H # include #else /* neither U_HAVE_STDINT_H nor U_HAVE_INTTYPES_H */ /// \cond #if ! U_HAVE_INT8_T typedef signed char int8_t; #endif #if ! U_HAVE_UINT8_T typedef unsigned char uint8_t; #endif #if ! U_HAVE_INT16_T typedef signed short int16_t; #endif #if ! U_HAVE_UINT16_T typedef unsigned short uint16_t; #endif #if ! U_HAVE_INT32_T typedef signed int int32_t; #endif #if ! U_HAVE_UINT32_T typedef unsigned int uint32_t; #endif #if ! U_HAVE_INT64_T #ifdef _MSC_VER typedef signed __int64 int64_t; #else typedef signed long long int64_t; #endif #endif #if ! 
U_HAVE_UINT64_T #ifdef _MSC_VER typedef unsigned __int64 uint64_t; #else typedef unsigned long long uint64_t; #endif #endif /// \endcond #endif /* U_HAVE_STDINT_H / U_HAVE_INTTYPES_H */ #endif /* _PTYPES_H */ // umachine.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ****************************************************************************** * * Copyright (C) 1999-2015, International Business Machines * Corporation and others. All Rights Reserved. * ****************************************************************************** * file name: umachine.h * encoding: UTF-8 * tab size: 8 (not used) * indentation:4 * * created on: 1999sep13 * created by: Markus W. Scherer * * This file defines basic types and constants for ICU to be * platform-independent. umachine.h and utf.h are included into * utypes.h to provide all the general definitions for ICU. * All of these definitions used to be in utypes.h before * the UTF-handling macros made this unmaintainable. */ #ifndef __UMACHINE_H__ #define __UMACHINE_H__ /** * \file * \brief Basic types and constants for UTF * *

 * <h2> Basic types and constants for UTF </h2>

* This file defines basic types and constants for utf.h to be * platform-independent. umachine.h and utf.h are included into * utypes.h to provide all the general definitions for ICU. * All of these definitions used to be in utypes.h before * the UTF-handling macros made this unmaintainable. * */ /*==========================================================================*/ /* Include platform-dependent definitions */ /* which are contained in the platform-specific file platform.h */ /*==========================================================================*/ /* * ANSI C headers: * stddef.h defines wchar_t */ #if (NTDDI_VERSION >= NTDDI_WIN10_CO) // Microsoft-specific change: Non-C++ callers with older CRTs that don't have the // C99 header stdbool.h available can define the macro DONT_HAVE_STDBOOL_H before // including this header in order to avoid the dependency on stdbool.h // In this case we'll provide our own definitions for true/false, similar to the // stdbool.h header definitions. #ifndef __cplusplus #ifndef DONT_HAVE_STDBOOL_H #include #else #ifndef false #define false 0 #endif #ifndef true #define true 1 #endif #endif // DONT_HAVE_STDBOOL_H #endif // __cplusplus #endif // (NTDDI_VERSION >= NTDDI_WIN10_CO) #include /*==========================================================================*/ /* For C wrappers, we use the symbol U_CAPI. */ /* This works properly if the includer is C or C++. */ /* Functions are declared U_CAPI return-type U_EXPORT2 function-name()... */ /*==========================================================================*/ /** * \def U_CFUNC * This is used in a declaration of a library private ICU C function. * @stable ICU 2.4 */ /** * \def U_CDECL_BEGIN * This is used to begin a declaration of a library private ICU C API. 
* @stable ICU 2.4 */ /** * \def U_CDECL_END * This is used to end a declaration of a library private ICU C API * @stable ICU 2.4 */ #ifdef __cplusplus # define U_CFUNC extern "C" # define U_CDECL_BEGIN extern "C" { # define U_CDECL_END } #else # define U_CFUNC extern # define U_CDECL_BEGIN # define U_CDECL_END #endif #ifndef U_ATTRIBUTE_DEPRECATED /** * \def U_ATTRIBUTE_DEPRECATED * This is used for GCC specific attributes * @internal */ #if U_GCC_MAJOR_MINOR >= 302 # define U_ATTRIBUTE_DEPRECATED __attribute__ ((deprecated)) /** * \def U_ATTRIBUTE_DEPRECATED * This is used for Visual C++ specific attributes * @internal */ #elif defined(_MSC_VER) && (_MSC_VER >= 1400) # define U_ATTRIBUTE_DEPRECATED __declspec(deprecated) #else # define U_ATTRIBUTE_DEPRECATED #endif #endif /** This is used to declare a function as a public ICU C API @stable ICU 2.0*/ #define U_CAPI U_CFUNC U_EXPORT /** Obsolete/same as U_CAPI; was used to declare a function as a stable public ICU C API*/ #define U_STABLE U_CAPI /** Obsolete/same as U_CAPI; was used to declare a function as a draft public ICU C API */ #define U_DRAFT U_CAPI /** This is used to declare a function as a deprecated public ICU C API */ #define U_DEPRECATED U_CAPI U_ATTRIBUTE_DEPRECATED /** Obsolete/same as U_CAPI; was used to declare a function as an obsolete public ICU C API */ #define U_OBSOLETE U_CAPI /** Obsolete/same as U_CAPI; was used to declare a function as an internal ICU C API */ #define U_INTERNAL U_CAPI // Before ICU 65, function-like, multi-statement ICU macros were just defined as // series of statements wrapped in { } blocks and the caller could choose to // either treat them as if they were actual functions and end the invocation // with a trailing ; creating an empty statement after the block or else omit // this trailing ; using the knowledge that the macro would expand to { }. 
// // But doing so doesn't work well with macros that look like functions and // compiler warnings about empty statements (ICU-20601) and ICU 65 therefore // switches to the standard solution of wrapping such macros in do { } while. // // This will however break existing code that depends on being able to invoke // these macros without a trailing ; so to be able to remain compatible with // such code the wrapper is itself defined as macros so that it's possible to // build ICU 65 and later with the old macro behaviour, like this: // // export CPPFLAGS='-DUPRV_BLOCK_MACRO_BEGIN="" -DUPRV_BLOCK_MACRO_END=""' // runConfigureICU ... // #if (NTDDI_VERSION < NTDDI_WIN10_CO) // Microsoft-specific change: // In order to avoid having to add #ifdefs everywhere these are used, since these // are private macros, we define these to nothing for versions below NTDDI_WIN10_CO // for compatibility. #define UPRV_BLOCK_MACRO_BEGIN #define UPRV_BLOCK_MACRO_END #endif // (NTDDI_VERSION < NTDDI_WIN10_CO) /** * \def UPRV_BLOCK_MACRO_BEGIN * Defined as the "do" keyword by default. * @internal */ #ifndef UPRV_BLOCK_MACRO_BEGIN #define UPRV_BLOCK_MACRO_BEGIN do #endif /** * \def UPRV_BLOCK_MACRO_END * Defined as "while (false)" by default. 
* @internal */ #ifndef UPRV_BLOCK_MACRO_END #define UPRV_BLOCK_MACRO_END while (false) #endif /*==========================================================================*/ /* limits for int32_t etc., like in POSIX inttypes.h */ /*==========================================================================*/ #ifndef INT8_MIN /** The smallest value an 8 bit signed integer can hold @stable ICU 2.0 */ # define INT8_MIN ((int8_t)(-128)) #endif #ifndef INT16_MIN /** The smallest value a 16 bit signed integer can hold @stable ICU 2.0 */ # define INT16_MIN ((int16_t)(-32767-1)) #endif #ifndef INT32_MIN /** The smallest value a 32 bit signed integer can hold @stable ICU 2.0 */ # define INT32_MIN ((int32_t)(-2147483647-1)) #endif #ifndef INT8_MAX /** The largest value an 8 bit signed integer can hold @stable ICU 2.0 */ # define INT8_MAX ((int8_t)(127)) #endif #ifndef INT16_MAX /** The largest value a 16 bit signed integer can hold @stable ICU 2.0 */ # define INT16_MAX ((int16_t)(32767)) #endif #ifndef INT32_MAX /** The largest value a 32 bit signed integer can hold @stable ICU 2.0 */ # define INT32_MAX ((int32_t)(2147483647)) #endif #ifndef UINT8_MAX /** The largest value an 8 bit unsigned integer can hold @stable ICU 2.0 */ # define UINT8_MAX ((uint8_t)(255U)) #endif #ifndef UINT16_MAX /** The largest value a 16 bit unsigned integer can hold @stable ICU 2.0 */ # define UINT16_MAX ((uint16_t)(65535U)) #endif #ifndef UINT32_MAX /** The largest value a 32 bit unsigned integer can hold @stable ICU 2.0 */ # define UINT32_MAX ((uint32_t)(4294967295U)) #endif #if defined(U_INT64_T_UNAVAILABLE) # error int64_t is required for decimal format and rule-based number format. #else # ifndef INT64_C /** * Provides a platform independent way to specify a signed 64-bit integer constant. 
* note: may be wrong for some 64 bit platforms - ensure your compiler provides INT64_C * @stable ICU 2.8 */ # define INT64_C(c) c ## LL # endif # ifndef UINT64_C /** * Provides a platform independent way to specify an unsigned 64-bit integer constant. * note: may be wrong for some 64 bit platforms - ensure your compiler provides UINT64_C * @stable ICU 2.8 */ # define UINT64_C(c) c ## ULL # endif # ifndef U_INT64_MIN /** The smallest value a 64 bit signed integer can hold @stable ICU 2.8 */ # define U_INT64_MIN ((int64_t)(INT64_C(-9223372036854775807)-1)) # endif # ifndef U_INT64_MAX /** The largest value a 64 bit signed integer can hold @stable ICU 2.8 */ # define U_INT64_MAX ((int64_t)(INT64_C(9223372036854775807))) # endif # ifndef U_UINT64_MAX /** The largest value a 64 bit unsigned integer can hold @stable ICU 2.8 */ # define U_UINT64_MAX ((uint64_t)(UINT64_C(18446744073709551615))) # endif #endif /*==========================================================================*/ /* Boolean data type */ /*==========================================================================*/ /** * The ICU boolean type, a signed-byte integer. * ICU-specific for historical reasons: The C and C++ standards used to not define type bool. * Also provides a fixed type definition, as opposed to * type bool whose details (e.g., sizeof) may vary by compiler and between C and C++. * * @stable ICU 2.0 */ typedef int8_t UBool; #if (NTDDI_VERSION < NTDDI_WIN10_CO) // Microsoft-specific change: // ICU previously defined FALSE=0 & TRUE=1 in the public ICU headers, but this was changed // in ICU 68, such that they are now deprecated and no longer defined. For versions // below NTDDI_WIN10_CO, we set U_DEFINE_FALSE_AND_TRUE to 1 so they will still be defined // for compatibility. #define U_DEFINE_FALSE_AND_TRUE 1 #endif // (NTDDI_VERSION < NTDDI_WIN10_CO) /** * \def U_DEFINE_FALSE_AND_TRUE * Normally turns off defining macros FALSE=0 & TRUE=1 in public ICU headers. 
* These obsolete macros sometimes break compilation of other code that * defines enum constants or similar with these names. * C++ has long defined bool/false/true. * C99 also added definitions for these, although as macros; see stdbool.h. * * You may transitionally define U_DEFINE_FALSE_AND_TRUE=1 if you need time to migrate code. * * @internal ICU 68 */ #ifdef U_DEFINE_FALSE_AND_TRUE // Use the predefined value. #elif defined(U_COMBINED_IMPLEMENTATION) || \ defined(U_COMMON_IMPLEMENTATION) || defined(U_I18N_IMPLEMENTATION) || \ defined(U_IO_IMPLEMENTATION) || defined(U_LAYOUTEX_IMPLEMENTATION) || \ defined(U_TOOLUTIL_IMPLEMENTATION) // Inside ICU: Keep FALSE & TRUE available. # define U_DEFINE_FALSE_AND_TRUE 1 #else // Outside ICU: Avoid collision with non-macro definitions of FALSE & TRUE. # define U_DEFINE_FALSE_AND_TRUE 0 #endif #if U_DEFINE_FALSE_AND_TRUE || defined(U_IN_DOXYGEN) #ifndef TRUE /** * The TRUE value of a UBool. * * @deprecated ICU 68 Use standard "true" instead. */ # define TRUE 1 #endif #ifndef FALSE /** * The FALSE value of a UBool. * * @deprecated ICU 68 Use standard "false" instead. */ # define FALSE 0 #endif #endif // U_DEFINE_FALSE_AND_TRUE /*==========================================================================*/ /* Unicode data types */ /*==========================================================================*/ /* wchar_t-related definitions -------------------------------------------- */ /* * \def U_WCHAR_IS_UTF16 * Defined if wchar_t uses UTF-16. * * @stable ICU 2.0 */ /* * \def U_WCHAR_IS_UTF32 * Defined if wchar_t uses UTF-32. 
* * @stable ICU 2.0 */ #if !defined(U_WCHAR_IS_UTF16) && !defined(U_WCHAR_IS_UTF32) # ifdef __STDC_ISO_10646__ # if (U_SIZEOF_WCHAR_T==2) # define U_WCHAR_IS_UTF16 # elif (U_SIZEOF_WCHAR_T==4) # define U_WCHAR_IS_UTF32 # endif # elif defined __UCS2__ # if (U_PF_OS390 <= U_PLATFORM && U_PLATFORM <= U_PF_OS400) && (U_SIZEOF_WCHAR_T==2) # define U_WCHAR_IS_UTF16 # endif # elif defined(__UCS4__) || (U_PLATFORM == U_PF_OS400 && defined(__UTF32__)) # if (U_SIZEOF_WCHAR_T==4) # define U_WCHAR_IS_UTF32 # endif # elif U_PLATFORM_IS_DARWIN_BASED || (U_SIZEOF_WCHAR_T==4 && U_PLATFORM_IS_LINUX_BASED) # define U_WCHAR_IS_UTF32 # elif U_PLATFORM_HAS_WIN32_API # define U_WCHAR_IS_UTF16 # endif #endif /* UChar and UChar32 definitions -------------------------------------------- */ /** Number of bytes in a UChar. @stable ICU 2.0 */ #define U_SIZEOF_UCHAR 2 /** * \def U_CHAR16_IS_TYPEDEF * If 1, then char16_t is a typedef and not a real type (yet) * @internal */ #if (U_PLATFORM == U_PF_AIX) && defined(__cplusplus) &&(U_CPLUSPLUS_VERSION < 11) // for AIX, uchar.h needs to be included # include # define U_CHAR16_IS_TYPEDEF 1 #elif defined(_MSC_VER) && (_MSC_VER < 1900) // Versions of Visual Studio/MSVC below 2015 do not support char16_t as a real type, // and instead use a typedef. https://msdn.microsoft.com/library/bb531344.aspx # define U_CHAR16_IS_TYPEDEF 1 #else # define U_CHAR16_IS_TYPEDEF 0 #endif /** * \var UChar * * The base type for UTF-16 code units and pointers. * Unsigned 16-bit integer. * Starting with ICU 59, C++ API uses char16_t directly, while C API continues to use UChar. * * UChar is configurable by defining the macro UCHAR_TYPE * on the preprocessor or compiler command line: * -DUCHAR_TYPE=uint16_t or -DUCHAR_TYPE=wchar_t (if U_SIZEOF_WCHAR_T==2) etc. * (The UCHAR_TYPE can also be \#defined earlier in this file, for outside the ICU library code.) * This is for transitional use from application code that uses uint16_t or wchar_t for UTF-16. 
* * The default is UChar=char16_t. * * C++11 defines char16_t as bit-compatible with uint16_t, but as a distinct type. * * In C, char16_t is a simple typedef of uint_least16_t. * ICU requires uint_least16_t=uint16_t for data memory mapping. * On macOS, char16_t is not available because the uchar.h standard header is missing. * * @stable ICU 4.4 */ #if 1 // #if 1 is normal. UChar defaults to char16_t in C++. // For configuration testing of UChar=uint16_t temporarily change this to #if 0. // The intltest Makefile #defines UCHAR_TYPE=char16_t, // so we only #define it to uint16_t if it is undefined so far. #elif !defined(UCHAR_TYPE) # define UCHAR_TYPE uint16_t #endif #if defined(U_COMBINED_IMPLEMENTATION) || defined(U_COMMON_IMPLEMENTATION) || \ defined(U_I18N_IMPLEMENTATION) || defined(U_IO_IMPLEMENTATION) // Inside the ICU library code, never configurable. typedef char16_t UChar; #elif defined(UCHAR_TYPE) typedef UCHAR_TYPE UChar; #elif (U_CPLUSPLUS_VERSION >= 11) typedef char16_t UChar; #else typedef uint16_t UChar; #endif /** * \var OldUChar * Default ICU 58 definition of UChar. * A base type for UTF-16 code units and pointers. * Unsigned 16-bit integer. * * Define OldUChar to be wchar_t if that is 16 bits wide. * If wchar_t is not 16 bits wide, then define UChar to be uint16_t. * * This makes the definition of OldUChar platform-dependent * but allows direct string type compatibility with platforms with * 16-bit wchar_t types. * * This is how UChar was defined in ICU 58, for transition convenience. * Exception: ICU 58 UChar was defined to UCHAR_TYPE if that macro was defined. * The current UChar responds to UCHAR_TYPE but OldUChar does not. * * @stable ICU 59 */ #if U_SIZEOF_WCHAR_T==2 typedef wchar_t OldUChar; #elif defined(__CHAR16_TYPE__) typedef __CHAR16_TYPE__ OldUChar; #else typedef uint16_t OldUChar; #endif /** * Define UChar32 as a type for single Unicode code points. * UChar32 is a signed 32-bit integer (same as int32_t). 
* * The Unicode code point range is 0..0x10ffff. * All other values (negative or >=0x110000) are illegal as Unicode code points. * They may be used as sentinel values to indicate "done", "error" * or similar non-code point conditions. * * Before ICU 2.4 (Jitterbug 2146), UChar32 was defined * to be wchar_t if that is 32 bits wide (wchar_t may be signed or unsigned) * or else to be uint32_t. * That is, the definition of UChar32 was platform-dependent. * * @see U_SENTINEL * @stable ICU 2.4 */ typedef int32_t UChar32; /** * This value is intended for sentinel values for APIs that * (take or) return single code points (UChar32). * It is outside of the Unicode code point range 0..0x10ffff. * * For example, a "done" or "error" value in a new API * could be indicated with U_SENTINEL. * * ICU APIs designed before ICU 2.4 usually define service-specific "done" * values, mostly 0xffff. * Those may need to be distinguished from * actual U+ffff text contents by calling functions like * CharacterIterator::hasNext() or UnicodeString::length(). * * @return -1 * @see UChar32 * @stable ICU 2.4 */ #define U_SENTINEL (-1) #endif // utf.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * * Copyright (C) 1999-2011, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* * file name: utf.h * encoding: UTF-8 * tab size: 8 (not used) * indentation:4 * * created on: 1999sep09 * created by: Markus W. Scherer */ /** * \file * \brief C API: Code point macros * * This file defines macros for checking whether a code point is * a surrogate or a non-character etc. * * If U_NO_DEFAULT_INCLUDE_UTF_HEADERS is 0 then utf.h is included by utypes.h * and itself includes utf8.h and utf16.h after some * common definitions. 
* If U_NO_DEFAULT_INCLUDE_UTF_HEADERS is 1 then each of these headers must be * included explicitly if their definitions are used. * * utf8.h and utf16.h define macros for efficiently getting code points * in and out of UTF-8/16 strings. * utf16.h macros have "U16_" prefixes. * utf8.h defines similar macros with "U8_" prefixes for UTF-8 string handling. * * ICU mostly processes 16-bit Unicode strings. * Most of the time, such strings are well-formed UTF-16. * Single, unpaired surrogates must be handled as well, and are treated in ICU * like regular code points where possible. * (Pairs of surrogate code points are indistinguishable from supplementary * code points encoded as pairs of supplementary code units.) * * In fact, almost all Unicode code points in normal text (>99%) * are on the BMP (<=U+ffff) and even <=U+d7ff. * ICU functions handle supplementary code points (U+10000..U+10ffff) * but are optimized for the much more frequently occurring BMP code points. * * umachine.h defines UChar to be an unsigned 16-bit integer. * Since ICU 59, ICU uses char16_t in C++, UChar only in C, * and defines UChar=char16_t by default. See the UChar API docs for details. * * UChar32 is defined to be a signed 32-bit integer (int32_t), large enough for a 21-bit * Unicode code point (Unicode scalar value, 0..0x10ffff) and U_SENTINEL (-1). * Before ICU 2.4, the definition of UChar32 was similarly platform-dependent as * the definition of UChar. For details see the documentation for UChar32 itself. * * utf.h defines a small number of C macros for single Unicode code points. * These are simple checks for surrogates and non-characters. * For actual Unicode character properties see uchar.h. * * By default, string operations must be done with error checking in case * a string is not well-formed UTF-16 or UTF-8. * * The U16_ macros detect if a surrogate code unit is unpaired * (lead unit without trail unit or vice versa) and just return the unit itself * as the code point. 
* * The U8_ macros detect illegal byte sequences and return a negative value. * Starting with ICU 60, the observable length of a single illegal byte sequence * skipped by one of these macros follows the Unicode 6+ recommendation * which is consistent with the W3C Encoding Standard. * * There are ..._OR_FFFD versions of both U16_ and U8_ macros * that return U+FFFD for illegal code unit sequences. * * The regular "safe" macros require that the initial, passed-in string index * is within bounds. They only check the index when they read more than one * code unit. This is usually done with code similar to the following loop: *
 * <pre>while(i<length) {
 *     U16_NEXT(s, i, length, c);
 *     // use c
 * }</pre>
 *
 * When it is safe to assume that text is well-formed UTF-16
 * (does not contain single, unpaired surrogates), then one can use
 * U16_..._UNSAFE macros.
 * These do not check for proper code unit sequences or truncated text and may
 * yield wrong results or even cause a crash if they are used with "malformed"
 * text.
 * In practice, U16_..._UNSAFE macros will produce slightly less code but
 * should not be faster because the processing is only different when a
 * surrogate code unit is detected, which will be rare.
 *
 * Similarly for UTF-8, there are "safe" macros without a suffix,
 * and U8_..._UNSAFE versions.
 * The performance differences are much larger here because UTF-8 provides so
 * many opportunities for malformed sequences.
 * The unsafe UTF-8 macros are entirely implemented inside the macro definitions
 * and are fast, while the safe UTF-8 macros call functions for some complicated cases.
 *
 * Unlike with UTF-16, malformed sequences cannot be expressed with distinct
 * code point values (0..U+10ffff). They are indicated with negative values instead.
 *
 * For more information see the ICU User Guide Strings chapter
 * (https://unicode-org.github.io/icu/userguide/strings).
 *
 * Usage:
 * ICU coding guidelines for if() statements should be followed when using these macros.
 * Compound statements (curly braces {}) must be used for if-else-while...
 * bodies, and all macro statements should be terminated with a semicolon.
 *
 * @stable ICU 2.4
 */

#ifndef __UTF_H__
#define __UTF_H__

/* include the utfXX.h after the following definitions */

/* single-code point definitions -------------------------------------------- */

/**
 * Tests whether c is a Unicode noncharacter, that is, either
 * - in the range U+fdd0..U+fdef, or
 * - a value up to U+10ffff whose low 16 bits are fffe or ffff.
 *
 * This is a macro: the argument may be evaluated more than once.
 *
 * @param c 32-bit code point
 * @return true or false
 * @stable ICU 2.4
 */
#define U_IS_UNICODE_NONCHAR(c) \
    (!((c)<0xfdd0 || \
       ((c)>0xfdef && ((c)&0xfffe)!=0xfffe) || \
       (c)>0x10ffff))

/**
 * Is c a Unicode code point value (0..U+10ffff)
 * that can be assigned a character?
 *
 * Code points that are not characters include:
 * - single surrogate code points (U+d800..U+dfff, 2048 code points)
 * - the last two code points on each plane (U+__fffe and U+__ffff, 34 code points)
 * - U+fdd0..U+fdef (new with Unicode 3.1, 32 code points)
 * - the highest Unicode code point value is U+10ffff
 *
 * This means that all code points below U+d800 are character code points,
 * and that boundary is tested first for performance.
 *
 * @param c 32-bit code point
 * @return true or false
 * @stable ICU 2.4
 */
#define U_IS_UNICODE_CHAR(c) \
    ((uint32_t)(c)<0xd800 || \
        ((c)>0xdfff && 0x10ffff>=(c) && !U_IS_UNICODE_NONCHAR(c)))

/**
 * Is this code point a BMP code point (U+0000..U+ffff)?
 * @param c 32-bit code point
 * @return true or false
 * @stable ICU 2.8
 */
#define U_IS_BMP(c) ((uint32_t)(c)<0x10000)

/**
 * Is this code point a supplementary code point (U+10000..U+10ffff)?
 *
 * The argument is converted to uint32_t before the range offset is
 * subtracted, so the subtraction is unsigned and therefore well-defined for
 * every int32_t input, including values close to INT32_MIN.
 * @param c 32-bit code point
 * @return true or false
 * @stable ICU 2.8
 */
#define U_IS_SUPPLEMENTARY(c) (((uint32_t)(c)-0x10000u)<=0xfffffu)
 
/**
 * Is this code point a lead surrogate (U+d800..U+dbff)?
 * @param c 32-bit code point
 * @return true or false
 * @stable ICU 2.4
 */
#define U_IS_LEAD(c) (0xd800==((c)&0xfffffc00))

/**
 * Is this code point a trail surrogate (U+dc00..U+dfff)?
 * @param c 32-bit code point
 * @return true or false
 * @stable ICU 2.4
 */
#define U_IS_TRAIL(c) (0xdc00==((c)&0xfffffc00))

/**
 * Is this code point a surrogate (U+d800..U+dfff)?
 * @param c 32-bit code point
 * @return true or false
 * @stable ICU 2.4
 */
#define U_IS_SURROGATE(c) (0xd800==((c)&0xfffff800))

/**
 * Assuming c is a surrogate code point (U_IS_SURROGATE(c)),
 * is it a lead surrogate?
 * @param c 32-bit code point
 * @return true or false
 * @stable ICU 2.4
 */
#define U_IS_SURROGATE_LEAD(c) (!((c)&0x400))

/**
 * Assuming c is a surrogate code point (U_IS_SURROGATE(c)),
 * is it a trail surrogate?
 * @param c 32-bit code point
 * @return true or false
 * @stable ICU 4.2
 */
#define U_IS_SURROGATE_TRAIL(c) (0!=((c)&0x400))

/* include the utfXX.h ------------------------------------------------------ */


#endif  /* __UTF_H__ */

// utf8.h
// Copyright (C) 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
*   Copyright (C) 1999-2015, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  utf8.h
*   encoding:   UTF-8
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 1999sep13
*   created by: Markus W. Scherer
*/

/**
 * \file
 * \brief C API: 8-bit Unicode handling macros
 * 
 * This file defines macros to deal with 8-bit Unicode (UTF-8) code units (bytes) and strings.
 *
 * For more information see utf.h and the ICU User Guide Strings chapter
 * (https://unicode-org.github.io/icu/userguide/strings).
 *
 * Usage:
 * ICU coding guidelines for if() statements should be followed when using these macros.
 * Compound statements (curly braces {}) must be used for if-else-while...
 * bodies and all macro statements should be terminated with semicolon.
 */

#ifndef __UTF8_H__
#define __UTF8_H__

#ifndef __UTF_H__
#   include "icu/utf.h"
#endif

/* internal definitions ----------------------------------------------------- */

/**
 * Counts the trail bytes for a UTF-8 lead byte.
 * Returns 0 for 0..0xc1 as well as for 0xf5..0xff.
 * leadByte might be evaluated multiple times.
 *
 * This is internal since it is not meant to be called directly by external clients;
 * however it is called by public macros in this file and thus must remain stable.
 *
 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
 * @internal
 */
#define U8_COUNT_TRAIL_BYTES(leadByte) \
    (!U8_IS_LEAD(leadByte) ? 0 : \
        1+((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0))

/**
 * Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence.
 * Returns 0 for 0..0xc1. Undefined for 0xf5..0xff.
 * leadByte might be evaluated multiple times.
 *
 * This is internal since it is not meant to be called directly by external clients;
 * however it is called by public macros in this file and thus must remain stable.
 *
 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
 * @internal
 */
#define U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte) \
    ((0xc2<=(uint8_t)(leadByte))+(0xe0<=(uint8_t)(leadByte))+(0xf0<=(uint8_t)(leadByte)))

/**
 * Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value.
 *
 * This is internal since it is not meant to be called directly by external clients;
 * however it is called by public macros in this file and thus must remain stable.
 * @internal
 */
#define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes) \
    ((leadByte)&=((1<<(6-(countTrailBytes)))-1))

/**
 * Internal lookup table for validating the start of a 3-byte UTF-8 sequence;
 * read through U8_IS_VALID_LEAD3_AND_T1.
 * The table has one byte per lead byte E0..EF, selected by the low four bits
 * of the lead byte. Within that byte, the bit selected by the top three bits
 * of the first trail byte is set if the pair may start a valid sequence:
 * 0x20 (E0) accepts only A0..BF, 0x10 (ED) accepts only 80..9F,
 * and 0x30 (all other lead bytes) accepts 80..BF.
 * @see U8_IS_VALID_LEAD3_AND_T1
 * @internal
 */
#define U8_LEAD3_T1_BITS "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30"

/**
 * Internal validity check for the first two bytes of a 3-byte UTF-8 sequence.
 * Yields a non-zero value if lead byte E0..EF followed by first trail byte
 * 00..FF can start a valid sequence, and zero otherwise.
 * @internal
 */
#define U8_IS_VALID_LEAD3_AND_T1(lead, t1) \
    ((1<<((uint8_t)(t1)>>5))&U8_LEAD3_T1_BITS[(lead)&0xf])

/**
 * Internal lookup table for validating the start of a 4-byte UTF-8 sequence;
 * read through U8_IS_VALID_LEAD4_AND_T1.
 * The table has one byte per value of the top four bits of the first trail
 * byte. Within that byte, the bit selected by the low three bits of the lead
 * byte is set if the pair may start a valid sequence:
 * trail bytes 80..8F are accepted after F1..F4 (0x1E),
 * trail bytes 90..BF are accepted after F0..F3 (0x0F),
 * and all other byte values are rejected (0x00).
 * @see U8_IS_VALID_LEAD4_AND_T1
 * @internal
 */
#define U8_LEAD4_T1_BITS "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00"

/**
 * Internal validity check for the first two bytes of a 4-byte UTF-8 sequence.
 * Yields a non-zero value if lead byte F0..F4 followed by first trail byte
 * 00..FF can start a valid sequence, and zero otherwise.
 * @internal
 */
#define U8_IS_VALID_LEAD4_AND_T1(lead, t1) \
    ((1<<((lead)&7))&U8_LEAD4_T1_BITS[(uint8_t)(t1)>>4])

/**
 * Function for handling "next code point" with error-checking.
 *
 * The pre-RS5 definitions of U8_NEXT and U8_NEXT_OR_FFFD in this file call
 * this function for every sequence that they do not decode inline.
 * They pass the lead byte they have already read as c, the address of the
 * offset (already incremented past the lead byte) as pi, and -1 (U8_NEXT)
 * or -3 (U8_NEXT_OR_FFFD) as strict; the return value is stored in c.
 *
 * This is internal since it is not meant to be called directly by external clients;
 * however it is called by public macros in this
 * file and thus must remain stable, and should not be hidden when other internal
 * functions are hidden (otherwise public macros would fail to compile).
 * @internal
 */
U_CAPI UChar32 U_EXPORT2
utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict);

/**
 * Function for handling "append code point" with error-checking.
 *
 * The pre-RS5 definition of U8_APPEND in this file calls this function for
 * every case that it does not write inline. It passes the buffer, the current
 * offset, the capacity, the code point and the address of the caller's
 * isError variable, and assigns the return value back to the offset.
 *
 * This is internal since it is not meant to be called directly by external clients;
 * however it is called by public macros in this
 * file and thus must remain stable, and should not be hidden when other internal
 * functions are hidden (otherwise public macros would fail to compile).
 * @internal
 */
U_CAPI int32_t U_EXPORT2
utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError);

/**
 * Function for handling "previous code point" with error-checking.
 *
 * NOTE(review): no caller of this function is visible in this part of the
 * file; presumably it backs the safe backward-iteration macro (U8_PREV) in
 * the same way that utf8_nextCharSafeBody backs U8_NEXT - confirm.
 *
 * This is internal since it is not meant to be called directly by external clients;
 * however it is called by public macros in this
 * file and thus must remain stable, and should not be hidden when other internal
 * functions are hidden (otherwise public macros would fail to compile).
 * @internal
 */
U_CAPI UChar32 U_EXPORT2
utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict);

/**
 * Function for handling "skip backward one code point" with error-checking.
 *
 * U8_SET_CP_START calls this function when the byte at the given offset is a
 * trail byte, passing the string, the start offset and the current offset,
 * and assigns the return value back to the offset.
 *
 * This is internal since it is not meant to be called directly by external clients;
 * however it is called by public macros in this
 * file and thus must remain stable, and should not be hidden when other internal
 * functions are hidden (otherwise public macros would fail to compile).
 * @internal
 */
U_CAPI int32_t U_EXPORT2
utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);

/* single-code point definitions -------------------------------------------- */

/**
 * Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)?
 * @param c 8-bit code unit (byte)
 * @return true or false
 * @stable ICU 2.4
 */
#define U8_IS_SINGLE(c) (!((c)&0x80))

/**
 * Is this code unit (byte) a UTF-8 lead byte? (0xC2..0xF4)
 * @param c 8-bit code unit (byte)
 * @return true or false
 * @stable ICU 2.4
 */
#define U8_IS_LEAD(c) ((uint8_t)((c)-0xc2)<0x33)
// 0x33=0xf4-0xc2+1: the subtraction maps 0xC2..0xF4 onto 0..0x32, everything else wraps above that

/**
 * Is this code unit (byte) a UTF-8 trail byte? (0x80..0xBF)
 * @param c 8-bit code unit (byte)
 * @return true or false
 * @stable ICU 2.4
 */
#define U8_IS_TRAIL(c) (-0x40>(int8_t)(c))

/**
 * How many code units (bytes) are used for the UTF-8 encoding
 * of this Unicode code point?
 * @param c 32-bit code point
 * @return 1..4, or 0 if c is a surrogate or not a Unicode code point
 * @stable ICU 2.4
 */
#define U8_LENGTH(c) \
    ((uint32_t)(c)<0x80 ? 1 : \
     (uint32_t)(c)<0x800 ? 2 : \
     (uint32_t)(c)<0xd800 ? 3 : \
     ((uint32_t)(c)<0xe000 || 0x10ffff<(uint32_t)(c)) ? 0 : \
     (uint32_t)(c)<0x10000 ? 3 : 4)

/**
 * The maximum number of UTF-8 code units (bytes) per Unicode code point (U+0000..U+10ffff).
 * This is the largest value that U8_LENGTH can yield and the largest number
 * of bytes that a single U8_APPEND or U8_APPEND_UNSAFE writes.
 * @return 4
 * @stable ICU 2.4
 */
#define U8_MAX_LENGTH 4

/**
 * Get a code point from a string at a random-access offset,
 * without changing the offset.
 * The offset may point to either the lead byte or one of the trail bytes
 * for a code point, in which case the macro will read all of the bytes
 * for the code point.
 * The result is undefined if the offset points to an illegal UTF-8
 * byte sequence.
 * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U8_GET
 * @stable ICU 2.4
 */
#define U8_GET_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    int32_t _u8_get_unsafe_index=(int32_t)(i); \
    U8_SET_CP_START_UNSAFE(s, _u8_get_unsafe_index); \
    U8_NEXT_UNSAFE(s, _u8_get_unsafe_index, c); \
} UPRV_BLOCK_MACRO_END

/**
 * Get a code point from a string at a random-access offset,
 * without changing the offset.
 * The offset may point to either the lead byte or one of the trail bytes
 * for a code point, in which case the macro will read all of the bytes
 * for the code point.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * If the offset points to an illegal UTF-8 byte sequence, then
 * c is set to a negative value.
 * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT.
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset
 * @param i int32_t string offset, must be start<=i= NTDDI_WIN10_RS5)
/* RS5 and later: the shared decoder, with U_SENTINEL as the value for ill-formed input. */
#define U8_NEXT(s, i, length, c) U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, U_SENTINEL)
#elif (NTDDI_VERSION >= NTDDI_WIN10_RS3)
/*
 * RS3 up to (not including) RS5: two- and three-byte sequences are decoded
 * inline; every other case, including all ill-formed input, is handed to
 * utf8_nextCharSafeBody() with the lead byte in c and i already past it.
 * Every use of a macro parameter is parenthesized so that arbitrary argument
 * expressions (for example a conditional expression for s) expand as intended.
 * s, i, length and c are evaluated multiple times.
 */
#define U8_NEXT(s, i, length, c) { \
    (c)=(uint8_t)(s)[(i)++]; \
    if(!U8_IS_SINGLE(c)) { \
        uint8_t __t1, __t2; \
        if( /* handle U+0800..U+FFFF inline */ \
                (0xe0<=(c) && (c)<0xf0) && \
                (((i)+1)<(length) || (length)<0) && \
                U8_IS_VALID_LEAD3_AND_T1((c), __t1=(s)[(i)]) && \
                (__t2=(s)[(i)+1]-0x80)<=0x3f) { \
            (c)=(((c)&0xf)<<12)|((__t1&0x3f)<<6)|__t2; \
            (i)+=2; \
        } else if( /* handle U+0080..U+07FF inline */ \
                ((c)<0xe0 && (c)>=0xc2) && \
                ((i)!=(length)) && \
                (__t1=(s)[(i)]-0x80)<=0x3f) { \
            (c)=(((c)&0x1f)<<6)|__t1; \
            ++(i); \
        } else { \
            /* function call for "complicated" and error cases */ \
            (c)=utf8_nextCharSafeBody((const uint8_t *)(s), &(i), (length), (c), -1); \
        } \
    } \
}
#endif // (NTDDI_VERSION >= NTDDI_WIN10_RS5)

/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * The offset may point to the lead byte of a multi-byte sequence,
 * in which case the macro will read the whole sequence.
 * If the offset points to a trail byte or an illegal UTF-8 sequence, then
 * c is set to U+FFFD.
 *
 * This macro does not distinguish between a real U+FFFD in the text
 * and U+FFFD returned for an ill-formed sequence.
 * Use U8_NEXT() if that distinction is important.
 *
 * @param s const uint8_t * string
 * @param i int32_t string offset, must be i= NTDDI_WIN10_RS5)
/* RS5 and later: the shared decoder, with U+FFFD as the value for ill-formed input. */
#define U8_NEXT_OR_FFFD(s, i, length, c) U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, 0xfffd)
#elif (NTDDI_VERSION >= NTDDI_WIN10_RS3)
/*
 * RS3 up to (not including) RS5: two- and three-byte sequences are decoded
 * inline; every other case, including all ill-formed input, is handed to
 * utf8_nextCharSafeBody() with the lead byte in c and i already past it.
 * The only difference from the U8_NEXT definition for the same versions is
 * the last argument of that call (-3 instead of -1).
 * Every use of a macro parameter is parenthesized so that arbitrary argument
 * expressions (for example a conditional expression for s) expand as intended.
 * s, i, length and c are evaluated multiple times.
 */
#define U8_NEXT_OR_FFFD(s, i, length, c) { \
    (c)=(uint8_t)(s)[(i)++]; \
    if(!U8_IS_SINGLE(c)) { \
        uint8_t __t1, __t2; \
        if( /* handle U+0800..U+FFFF inline */ \
                (0xe0<=(c) && (c)<0xf0) && \
                (((i)+1)<(length) || (length)<0) && \
                U8_IS_VALID_LEAD3_AND_T1((c), __t1=(s)[(i)]) && \
                (__t2=(s)[(i)+1]-0x80)<=0x3f) { \
            (c)=(((c)&0xf)<<12)|((__t1&0x3f)<<6)|__t2; \
            (i)+=2; \
        } else if( /* handle U+0080..U+07FF inline */ \
                ((c)<0xe0 && (c)>=0xc2) && \
                ((i)!=(length)) && \
                (__t1=(s)[(i)]-0x80)<=0x3f) { \
            (c)=(((c)&0x1f)<<6)|__t1; \
            ++(i); \
        } else { \
            /* function call for "complicated" and error cases */ \
            (c)=utf8_nextCharSafeBody((const uint8_t *)(s), &(i), (length), (c), -3); \
        } \
    } \
}
#endif // (NTDDI_VERSION >= NTDDI_WIN10_RS5)

#if (NTDDI_VERSION >= NTDDI_WIN10_RS5)
/**
 * Shared implementation behind U8_NEXT and U8_NEXT_OR_FFFD (RS5 and later).
 *
 * Reads the byte at s[i] into c and post-increments i. A single-byte (ASCII)
 * value is returned as is. Otherwise the single && chain in the if() fetches,
 * validates and assembles the trail bytes one after the other, using the
 * comma operator to update c and i as each byte is accepted.
 * If any step of the chain fails, c is set to sub and i is left at the first
 * byte that was not accepted.
 *
 * length is only ever compared with !=, and a trail byte is always validated
 * before i moves past it.
 * s, i, length and c are evaluated multiple times.
 *
 * @param s const uint8_t * string
 * @param i string offset, modified
 * @param length string length
 * @param c output variable for the code point
 * @param sub value assigned to c for an ill-formed sequence
 * @internal
 */
#define U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, sub) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(uint8_t)(s)[(i)++]; \
    if(!U8_IS_SINGLE(c)) { \
        uint8_t __t = 0; \
        if((i)!=(length) && \
            /* fetch/validate/assemble all but last trail byte */ \
            ((c)>=0xe0 ? \
                ((c)<0xf0 ?  /* U+0800..U+FFFF except surrogates */ \
                    U8_LEAD3_T1_BITS[(c)&=0xf]&(1<<((__t=(s)[i])>>5)) && \
                    (__t&=0x3f, 1) \
                :  /* U+10000..U+10FFFF */ \
                    ((c)-=0xf0)<=4 && \
                    U8_LEAD4_T1_BITS[(__t=(s)[i])>>4]&(1<<(c)) && \
                    ((c)=((c)<<6)|(__t&0x3f), ++(i)!=(length)) && \
                    (__t=(s)[i]-0x80)<=0x3f) && \
                /* valid second-to-last trail byte */ \
                ((c)=((c)<<6)|__t, ++(i)!=(length)) \
            :  /* U+0080..U+07FF */ \
                (c)>=0xc2 && ((c)&=0x1f, 1)) && \
            /* last trail byte */ \
            (__t=(s)[i]-0x80)<=0x3f && \
            ((c)=((c)<<6)|__t, ++(i), 1)) { \
        } else { \
            (c)=(sub);  /* ill-formed*/ \
        } \
    } \
} UPRV_BLOCK_MACRO_END
#endif // (NTDDI_VERSION >= NTDDI_WIN10_RS5)

/**
 * Append a code point to a string, overwriting 1 to 4 bytes.
 * The offset points to the current end of the string contents
 * and is advanced (post-increment).
 * "Unsafe" macro, assumes a valid code point and sufficient space in the string.
 * Otherwise, the result is undefined.
 *
 * @param s uint8_t * string buffer (written to, so it must not be const)
 * @param i string offset
 * @param c code point to append
 * @see U8_APPEND
 * @stable ICU 2.4
 */
#if (NTDDI_VERSION >= NTDDI_WIN10_RS5)
/* RS5 and later: c is evaluated exactly once, into __uc. */
#define U8_APPEND_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    uint32_t __uc=(c); \
    if(__uc<=0x7f) { \
        /* one byte */ \
        (s)[(i)++]=(uint8_t)__uc; \
    } else if(__uc<=0x7ff) { \
        /* two bytes: lead, trail */ \
        (s)[(i)++]=(uint8_t)((__uc>>6)|0xc0); \
        (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \
    } else if(__uc<=0xffff) { \
        /* three bytes: lead, two trails */ \
        (s)[(i)++]=(uint8_t)((__uc>>12)|0xe0); \
        (s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \
    } else { \
        /* four bytes: lead, three trails */ \
        (s)[(i)++]=(uint8_t)((__uc>>18)|0xf0); \
        (s)[(i)++]=(uint8_t)(((__uc>>12)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \
    } \
} UPRV_BLOCK_MACRO_END
#elif (NTDDI_VERSION >= NTDDI_WIN10_RS3)
/* RS3 up to (not including) RS5: c is evaluated multiple times. */
#define U8_APPEND_UNSAFE(s, i, c) { \
    if((uint32_t)(c)<=0x7f) { \
        /* one byte */ \
        (s)[(i)++]=(uint8_t)(c); \
    } else if((uint32_t)(c)<=0x7ff) { \
        /* two bytes: lead, trail */ \
        (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0); \
        (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \
    } else if((uint32_t)(c)<=0xffff) { \
        /* three bytes: lead, two trails */ \
        (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0); \
        (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \
    } else { \
        /* four bytes: lead, three trails */ \
        (s)[(i)++]=(uint8_t)(((c)>>18)|0xf0); \
        (s)[(i)++]=(uint8_t)((((c)>>12)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \
    } \
}
#endif // (NTDDI_VERSION >= NTDDI_WIN10_RS5)

/**
 * Append a code point to a string, overwriting 1 to 4 bytes.
 * The offset points to the current end of the string contents
 * and is advanced (post-increment).
 * "Safe" macro, checks for a valid code point.
 * If a non-ASCII code point is written, checks for sufficient space in the string.
 * If the code point is not valid or trail bytes do not fit,
 * then isError is set to true.
 *
 * @param s uint8_t * string buffer (written to, so it must not be const)
 * @param i int32_t string offset, must be i= NTDDI_WIN10_CO)
/*
 * NTDDI_WIN10_CO and later: everything is handled inline and c is evaluated
 * once, into __uc. A surrogate code point, a value above U+10ffff, or a
 * multi-byte encoding that does not fit below capacity falls through to the
 * final else, which sets isError to true.
 * isError is only ever set, never cleared; a one-byte (ASCII) write is not
 * checked against capacity.
 */
#define U8_APPEND(s, i, capacity, c, isError) UPRV_BLOCK_MACRO_BEGIN { \
    uint32_t __uc=(c); \
    if(__uc<=0x7f) { \
        (s)[(i)++]=(uint8_t)__uc; \
    } else if(__uc<=0x7ff && (i)+1<(capacity)) { \
        (s)[(i)++]=(uint8_t)((__uc>>6)|0xc0); \
        (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \
    } else if((__uc<=0xd7ff || (0xe000<=__uc && __uc<=0xffff)) && (i)+2<(capacity)) { \
        (s)[(i)++]=(uint8_t)((__uc>>12)|0xe0); \
        (s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \
    } else if(0xffff<__uc && __uc<=0x10ffff && (i)+3<(capacity)) { \
        (s)[(i)++]=(uint8_t)((__uc>>18)|0xf0); \
        (s)[(i)++]=(uint8_t)(((__uc>>12)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \
    } else { \
        (isError)=true; \
    } \
} UPRV_BLOCK_MACRO_END
#elif (NTDDI_VERSION >= NTDDI_WIN10_RS5)
/*
 * RS5 up to (not including) CO: same logic as the definition above, but the
 * body is a plain brace block and the error value is spelled TRUE.
 */
#define U8_APPEND(s, i, capacity, c, isError) { \
    uint32_t __uc=(c); \
    if(__uc<=0x7f) { \
        (s)[(i)++]=(uint8_t)__uc; \
    } else if(__uc<=0x7ff && (i)+1<(capacity)) { \
        (s)[(i)++]=(uint8_t)((__uc>>6)|0xc0); \
        (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \
    } else if((__uc<=0xd7ff || (0xe000<=__uc && __uc<=0xffff)) && (i)+2<(capacity)) { \
        (s)[(i)++]=(uint8_t)((__uc>>12)|0xe0); \
        (s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \
    } else if(0xffff<__uc && __uc<=0x10ffff && (i)+3<(capacity)) { \
        (s)[(i)++]=(uint8_t)((__uc>>18)|0xf0); \
        (s)[(i)++]=(uint8_t)(((__uc>>12)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \
    } else { \
        (isError)=TRUE; \
    } \
}

#elif (NTDDI_VERSION >= NTDDI_WIN10_RS3)
/*
 * RS3 up to (not including) RS5: only code points up to U+d7ff that fit are
 * written inline; every other case is handed to utf8_appendCharSafeBody(),
 * whose return value becomes the new offset. c is evaluated multiple times.
 */
#define U8_APPEND(s, i, capacity, c, isError) { \
    if((uint32_t)(c)<=0x7f) { \
        (s)[(i)++]=(uint8_t)(c); \
    } else if((uint32_t)(c)<=0x7ff && (i)+1<(capacity)) { \
        (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0); \
        (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \
    } else if((uint32_t)(c)<=0xd7ff && (i)+2<(capacity)) { \
        (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0); \
        (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \
    } else { \
        (i)=utf8_appendCharSafeBody(s, (i), (capacity), c, &(isError)); \
    } \
}
#endif // (NTDDI_VERSION >= NTDDI_WIN10_CO)

/**
 * Advance the string offset from one code point boundary to the next.
 * (Post-incrementing iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @see U8_FWD_1
 * @stable ICU 2.4
 */
#define U8_FWD_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
    /* step over the byte at i plus the trail bytes that it announces */ \
    (i)+=U8_COUNT_TRAIL_BYTES_UNSAFE((s)[i])+1; \
} UPRV_BLOCK_MACRO_END

/**
 * Advance the string offset from one code point boundary to the next.
 * (Post-incrementing iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * @param s const uint8_t * string
 * @param i int32_t string offset, must be i=0xf0 */ { \
            if(U8_IS_VALID_LEAD4_AND_T1(__b, __t1) && \
                    ++(i)!=(length) && U8_IS_TRAIL((s)[i]) && \
                    ++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \
                ++(i); \
            } \
        } \
    } \
} UPRV_BLOCK_MACRO_END

/**
 * Advance the string offset from one code point boundary to the n-th next one,
 * i.e., move forward by n code points.
 * (Post-incrementing iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param n number of code points to skip
 * @see U8_FWD_N
 * @stable ICU 2.4
 */
#define U8_FWD_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \
    /* n is evaluated once; a count of zero or less moves nothing */ \
    int32_t __N=(n); \
    for(; __N>0; --__N) { \
        U8_FWD_1_UNSAFE(s, i); \
    } \
} UPRV_BLOCK_MACRO_END

/**
 * Advance the string offset from one code point boundary to the n-th next one,
 * i.e., move forward by n code points.
 * (Post-incrementing iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * @param s const uint8_t * string
 * @param i int32_t string offset, must be i0 && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { \
        U8_FWD_1(s, i, length); \
        --__N; \
    } \
} UPRV_BLOCK_MACRO_END

/**
 * Adjust a random-access offset to a code point boundary
 * at the start of a code point.
 * If the offset points to a UTF-8 trail byte,
 * then the offset is moved backward to the corresponding lead byte.
 * Otherwise, it is not modified.
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @see U8_SET_CP_START
 * @stable ICU 2.4
 */
#define U8_SET_CP_START_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
    /* back up one byte at a time for as long as s[i] is a trail byte */ \
    for(; U8_IS_TRAIL((s)[i]); --(i)) {} \
} UPRV_BLOCK_MACRO_END

/**
 * Adjust a random-access offset to a code point boundary
 * at the start of a code point.
 * If the offset points to a UTF-8 trail byte,
 * then the offset is moved backward to the corresponding lead byte.
 * Otherwise, it is not modified.
 *
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 * Unlike U8_TRUNCATE_IF_INCOMPLETE(), this macro always reads s[i].
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset (usually 0)
 * @param i int32_t string offset, must be start<=i
 * @see U8_SET_CP_START_UNSAFE
 * @see U8_TRUNCATE_IF_INCOMPLETE
 * @stable ICU 2.4
 */
#define U8_SET_CP_START(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \
    if(U8_IS_TRAIL((s)[(i)])) { \
        (i)=utf8_back1SafeBody(s, start, (i)); \
    } \
} UPRV_BLOCK_MACRO_END

#if (NTDDI_VERSION >= NTDDI_WIN10_VB)
/**
 * If the string ends with a UTF-8 byte sequence that is valid so far
 * but incomplete, then reduce the length of the string to end before
 * the lead byte of that incomplete sequence.
 * For example, if the string ends with E1 80, the length is reduced by 2.
 *
 * In all other cases (the string ends with a complete sequence, or it is not
 * possible for any further trail byte to extend the trailing sequence)
 * the length remains unchanged.
 *
 * Useful for processing text split across multiple buffers
 * (save the incomplete sequence for later)
 * and for optimizing iteration
 * (check for string length only once per character).
 *
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 * Unlike U8_SET_CP_START(), this macro never reads s[length].
 * At most the last three bytes before length are inspected, and none
 * before start.
 *
 * (In UTF-16, simply check for U16_IS_LEAD(last code unit).)
 *
 * All three parameters are evaluated multiple times; each use is
 * parenthesized, so s may be an arbitrary pointer expression.
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset (usually 0)
 * @param length int32_t string length (usually start<=length), modified
 * @see U8_SET_CP_START
 * @stable ICU 61
 */
#define U8_TRUNCATE_IF_INCOMPLETE(s, start, length) UPRV_BLOCK_MACRO_BEGIN { \
    if((length)>(start)) { \
        uint8_t __b1=(s)[(length)-1]; \
        if(U8_IS_SINGLE(__b1)) { \
            /* common ASCII character */ \
        } else if(U8_IS_LEAD(__b1)) { \
            /* a lead byte on its own always starts an incomplete sequence */ \
            --(length); \
        } else if(U8_IS_TRAIL(__b1) && ((length)-2)>=(start)) { \
            uint8_t __b2=(s)[(length)-2]; \
            if(0xe0<=__b2 && __b2<=0xf4) { \
                /* 3- or 4-byte lead + one trail: incomplete if the pair is valid */ \
                if(__b2<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(__b2, __b1) : \
                        U8_IS_VALID_LEAD4_AND_T1(__b2, __b1)) { \
                    (length)-=2; \
                } \
            } else if(U8_IS_TRAIL(__b2) && ((length)-3)>=(start)) { \
                /* two trail bytes: incomplete only after a valid 4-byte lead */ \
                uint8_t __b3=(s)[(length)-3]; \
                if(0xf0<=__b3 && __b3<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(__b3, __b2)) { \
                    (length)-=3; \
                } \
            } \
        } \
    } \
} UPRV_BLOCK_MACRO_END
#endif // (NTDDI_VERSION >= NTDDI_WIN10_VB)

/* definitions with backward iteration -------------------------------------- */

/**
 * Move the string offset from one code point boundary to the previous one
 * and get the code point between them.
 * (Pre-decrementing backward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * The input offset may be the same as the string length.
 * If the offset is behind a multi-byte sequence, then the macro will read
 * the whole sequence.
 * If the offset is behind a lead byte, then that itself
 * will be returned as the code point.
 * The result is undefined if the offset is behind an illegal UTF-8 sequence.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U8_PREV
 * @stable ICU 2.4
 */
#define U8_PREV_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(uint8_t)(s)[--(i)]; \
    if(U8_IS_TRAIL(c)) { \
        uint8_t __b, __count=1, __shift=6; \
        /* keep the six payload bits of the last trail byte */ \
        (c)&=0x3f; \
        /* walk backward over further trail bytes, adding their payload bits */ \
        while((__b=(s)[--(i)])<0xc0) { \
            (c)|=(UChar32)(__b&0x3f)<<__shift; \
            ++__count; \
            __shift+=6; \
        } \
        /* __b is the lead byte: strip its length marker, add what is left */ \
        U8_MASK_LEAD_BYTE(__b, __count); \
        (c)|=(UChar32)__b<<__shift; \
    } \
} UPRV_BLOCK_MACRO_END

/**
 * Move the string offset from one code point boundary to the previous one
 * and get the code point between them.
 * (Pre-decrementing backward iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The input offset may be the same as the string length.
 * If the offset is behind a multi-byte sequence, then the macro will read
 * the whole sequence.
 * If the offset is behind a lead byte, then that itself
 * will be returned as the code point.
 * If the offset is behind an illegal UTF-8 sequence, then c is set to a negative value.
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset (usually 0)
 * @param i int32_t string offset, must be start0) { \
        U8_BACK_1_UNSAFE(s, i); \
        --__N; \
    } \
} UPRV_BLOCK_MACRO_END

/**
 * Move the string offset from one code point boundary to the n-th one before it,
 * i.e., move backward by n code points.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * @param s const uint8_t * string
 * @param start int32_t index of the start of the string
 * @param i int32_t string offset, must be start0 && (i)>(start)) { \
        U8_BACK_1(s, start, i); \
        --__N; \
    } \
} UPRV_BLOCK_MACRO_END

/**
 * Adjust a random-access offset to a code point boundary after a code point.
 * If the offset is behind a partial multi-byte sequence,
 * then the offset is incremented to behind the whole sequence.
 * Otherwise, it is not modified.
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @see U8_SET_CP_LIMIT
 * @stable ICU 2.4
 */
#define U8_SET_CP_LIMIT_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
    U8_BACK_1_UNSAFE(s, i); \
    U8_FWD_1_UNSAFE(s, i); \
} UPRV_BLOCK_MACRO_END

/**
 * Adjust a random-access offset to a code point boundary after a code point.
 * If the offset is behind a partial multi-byte sequence,
 * then the offset is incremented to behind the whole sequence.
 * Otherwise, it is not modified.
 * The input offset may be the same as the string length.
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset (usually 0)
 * @param i int32_t string offset, must be start<=i<=length
 * @param length int32_t string length
 * @see U8_SET_CP_LIMIT_UNSAFE
 * @stable ICU 2.4
 */
#define U8_SET_CP_LIMIT(s, start, i, length) UPRV_BLOCK_MACRO_BEGIN { \
    if((start)<(i) && ((i)<(length) || (length)<0)) { \
        U8_BACK_1(s, start, i); \
        U8_FWD_1(s, i, length); \
    } \
} UPRV_BLOCK_MACRO_END

#endif

// utf16.h
// Copyright (C) 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
*   Copyright (C) 1999-2012, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  utf16.h
*   encoding:   UTF-8
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 1999sep09
*   created by: Markus W. Scherer
*/

/**
 * \file
 * \brief C API: 16-bit Unicode handling macros
 * 
 * This file defines macros to deal with 16-bit Unicode (UTF-16) code units and strings.
 *
 * For more information see utf.h and the ICU User Guide Strings chapter
 * (https://unicode-org.github.io/icu/userguide/strings).
 *
 * Usage:
 * ICU coding guidelines for if() statements should be followed when using these macros.
 * Compound statements (curly braces {}) must be used for if-else-while...
 * bodies and all macro statements should be terminated with semicolon.
 */

#ifndef __UTF16_H__
#define __UTF16_H__

#ifndef __UTF_H__
#   include "icu/utf.h"
#endif

/* single-code point definitions -------------------------------------------- */

/**
 * Does this code unit alone encode a code point (BMP, not a surrogate)?
 * @param c 16-bit code unit
 * @return true or false
 * @stable ICU 2.4
 */
#define U16_IS_SINGLE(c) (((c)&0xfffff800)!=0xd800)

/**
 * Is this code unit a lead surrogate (U+d800..U+dbff)?
 * @param c 16-bit code unit
 * @return true or false
 * @stable ICU 2.4
 */
#define U16_IS_LEAD(c) (0xd800==((c)&0xfffffc00))

/**
 * Is this code unit a trail surrogate (U+dc00..U+dfff)?
 * @param c 16-bit code unit
 * @return true or false
 * @stable ICU 2.4
 */
#define U16_IS_TRAIL(c) (0xdc00==((c)&0xfffffc00))

/**
 * Is this code unit a surrogate (U+d800..U+dfff)?
 * @param c 16-bit code unit
 * @return true or false
 * @stable ICU 2.4
 */
#define U16_IS_SURROGATE(c) (((c)&0xfffff800)==0xd800)

/**
 * Assuming c is a surrogate code point (U16_IS_SURROGATE(c)),
 * is it a lead surrogate?
 * @param c 16-bit code unit
 * @return true or false
 * @stable ICU 2.4
 */
#define U16_IS_SURROGATE_LEAD(c) (!((c)&0x400))

/**
 * Assuming c is a surrogate code point (U16_IS_SURROGATE(c)),
 * is it a trail surrogate?
 * @param c 16-bit code unit
 * @return true or false
 * @stable ICU 4.2
 */
#define U16_IS_SURROGATE_TRAIL(c) (0!=((c)&0x400))

/**
 * Helper constant for U16_GET_SUPPLEMENTARY.
 *
 * U16_GET_SUPPLEMENTARY computes (lead<<10)+trail-U16_SURROGATE_OFFSET.
 * Subtracting this one constant removes the surrogate bases (0xd800 from the
 * shifted lead, 0xdc00 from the trail) and adds the 0x10000 at which the
 * supplementary code points start. Its value is 0x35fdc00.
 * @internal
 */
#define U16_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000)

/**
 * Combine a lead and a trail surrogate into the supplementary code point
 * (U+10000..U+10ffff) that they encode.
 * No validation is performed: the result is undefined if the input values
 * are not a lead and a trail surrogate.
 *
 * @param lead lead surrogate (U+d800..U+dbff)
 * @param trail trail surrogate (U+dc00..U+dfff)
 * @return supplementary code point (U+10000..U+10ffff)
 * @stable ICU 2.4
 */
#define U16_GET_SUPPLEMENTARY(lead, trail) \
    ((UChar32)(trail)+((UChar32)(lead)<<10UL)-U16_SURROGATE_OFFSET)


/**
 * Compute the lead surrogate (0xd800..0xdbff) of the surrogate pair that
 * encodes a supplementary code point (0x10000..0x10ffff).
 * The result is cast to UChar.
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @return lead surrogate (U+d800..U+dbff) for supplementary
 * @stable ICU 2.4
 */
#define U16_LEAD(supplementary) ((UChar)(0xd7c0+((supplementary)>>10)))

/**
 * Compute the trail surrogate (0xdc00..0xdfff) of the surrogate pair that
 * encodes a supplementary code point (0x10000..0x10ffff).
 * Only the low 10 bits of the input are used; the result is cast to UChar.
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @return trail surrogate (U+dc00..U+dfff) for supplementary
 * @stable ICU 2.4
 */
#define U16_TRAIL(supplementary) ((UChar)(0xdc00|((supplementary)&0x3ff)))

/**
 * Number of 16-bit code units (1 or 2) needed to encode a Unicode code point.
 * The value is compared as uint32_t against 0xffff, so the result is not
 * meaningful if c is not a Unicode code point (U+0000..U+10ffff).
 * @param c 32-bit code point
 * @return 1 or 2
 * @stable ICU 2.4
 */
#define U16_LENGTH(c) (1+((uint32_t)(c)>0xffff))

/**
 * The maximum number of 16-bit code units per Unicode code point (U+0000..U+10ffff).
 * This is the larger of the two values that U16_LENGTH can yield.
 * @return 2
 * @stable ICU 2.4
 */
#define U16_MAX_LENGTH 2

/**
 * Read the code point at a random-access offset without moving the offset.
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * The offset may be on either unit of a surrogate pair; the macro then reads
 * the neighbouring unit ((i)+1 for a lead, (i)-1 for a trail) and combines both.
 * No boundary or pairing checks are made, so the result is undefined
 * if the offset points to a single, unpaired surrogate.
 * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
 *
 * @param s const UChar * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U16_GET
 * @stable ICU 2.4
 */
#define U16_GET_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(s)[i]; \
    if(U16_IS_SURROGATE(c)) { \
        if(U16_IS_SURROGATE_TRAIL(c)) { \
            /* offset is on the trail unit: the lead is the unit before it */ \
            (c)=U16_GET_SUPPLEMENTARY((s)[(i)-1], (c)); \
        } else { \
            /* offset is on the lead unit: the trail is the unit after it */ \
            (c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)+1]); \
        } \
    } \
} UPRV_BLOCK_MACRO_END

/**
 * Get a code point from a string at a random-access offset,
 * without changing the offset.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The offset may point to either the lead or trail surrogate unit
 * for a supplementary code point, in which case the macro will read
 * the adjacent matching surrogate as well.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * If the offset points to a single, unpaired surrogate, then
 * c is set to that unpaired surrogate.
 * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<=i<length
 * @param length string length
 * @param c output UChar32 variable
 * @see U16_GET_UNSAFE
 * @stable ICU 2.4
 */
#define U16_GET(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(s)[i]; \
    if(U16_IS_SURROGATE(c)) { \
        uint16_t __c2; \
        if(U16_IS_SURROGATE_LEAD(c)) { \
            /* only look ahead if a following unit exists */ \
            if((i)+1!=(length) && U16_IS_TRAIL(__c2=(s)[(i)+1])) { \
                (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
            } \
        } else { \
            /* only look back if a preceding unit exists */ \
            if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
                (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
            } \
        } \
    } \
} UPRV_BLOCK_MACRO_END

#if (NTDDI_VERSION >= NTDDI_WIN10_19H1)
/**
 * Get a code point from a string at a random-access offset,
 * without changing the offset.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The offset may point to either the lead or trail surrogate unit
 * for a supplementary code point, in which case the macro will read
 * the adjacent matching surrogate as well.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * If the offset points to a single, unpaired surrogate, then
 * c is set to U+FFFD.
 * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT_OR_FFFD.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<=i<length
 * @param length string length
 * @param c output UChar32 variable
 * @see U16_GET_UNSAFE
 * @stable ICU 60
 */
#define U16_GET_OR_FFFD(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(s)[i]; \
    if(U16_IS_SURROGATE(c)) { \
        uint16_t __c2; \
        if(U16_IS_SURROGATE_LEAD(c)) { \
            if((i)+1!=(length) && U16_IS_TRAIL(__c2=(s)[(i)+1])) { \
                (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
            } else { \
                (c)=0xfffd; \
            } \
        } else { \
            if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
                (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
            } else { \
                (c)=0xfffd; \
            } \
        } \
    } \
} UPRV_BLOCK_MACRO_END
#endif // (NTDDI_VERSION >= NTDDI_WIN10_19H1)

/* definitions with forward iteration --------------------------------------- */

/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * The offset may point to the lead surrogate unit
 * for a supplementary code point, in which case the macro will read
 * the following trail surrogate as well.
 * If the offset points to a trail surrogate, then that itself
 * will be returned as the code point.
 * The result is undefined if the offset points to a single, unpaired lead surrogate.
 *
 * The offset is post-incremented once for the first unit and a second time
 * when that unit is a lead surrogate, so it advances by 1 or 2.
 * s, i and c each appear more than once in the expansion: i must be a
 * modifiable lvalue and the arguments should be free of side effects.
 *
 * @param s const UChar * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U16_NEXT
 * @stable ICU 2.4
 */
#define U16_NEXT_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(s)[(i)++]; \
    if(U16_IS_LEAD(c)) { \
        (c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)++]); \
    } \
} UPRV_BLOCK_MACRO_END

/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * The offset may point to the lead surrogate unit
 * for a supplementary code point, in which case the macro will read
 * the following trail surrogate as well.
 * If the offset points to a trail surrogate or
 * to a single, unpaired lead surrogate, then c is set to that unpaired surrogate.
 *
 * @param s const UChar * string
 * @param i string offset, must be i<length
 * @param length string length
 * @param c output UChar32 variable
 * @see U16_NEXT_UNSAFE
 * @stable ICU 2.4
 */
#define U16_NEXT(s, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(s)[(i)++]; \
    if(U16_IS_LEAD(c)) { \
        uint16_t __c2; \
        /* consume the next unit only if it exists and is a trail surrogate */ \
        if((i)!=(length) && U16_IS_TRAIL(__c2=(s)[(i)])) { \
            ++(i); \
            (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
        } \
    } \
} UPRV_BLOCK_MACRO_END

#if (NTDDI_VERSION >= NTDDI_WIN10_19H1)
/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * The offset may point to the lead surrogate unit
 * for a supplementary code point, in which case the macro will read
 * the following trail surrogate as well.
 * If the offset points to a trail surrogate or
 * to a single, unpaired lead surrogate, then c is set to U+FFFD.
 *
 * @param s const UChar * string
 * @param i string offset, must be i<length
 * @param length string length
 * @param c output UChar32 variable
 * @see U16_NEXT_UNSAFE
 * @stable ICU 60
 */
#define U16_NEXT_OR_FFFD(s, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(s)[(i)++]; \
    if(U16_IS_SURROGATE(c)) { \
        uint16_t __c2; \
        if(U16_IS_SURROGATE_LEAD(c) && (i)!=(length) && U16_IS_TRAIL(__c2=(s)[(i)])) { \
            ++(i); \
            (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
        } else { \
            (c)=0xfffd; \
        } \
    } \
} UPRV_BLOCK_MACRO_END
#endif // (NTDDI_VERSION >= NTDDI_WIN10_19H1)

/**
 * Write a code point at the current end of a string as 1 or 2 code units,
 * post-incrementing the offset past what was written.
 * "Unsafe" macro: neither the code point nor the remaining buffer space is
 * checked, so the result is undefined for an invalid code point or
 * insufficient space.
 *
 * @param s UChar * string buffer (written to)
 * @param i string offset
 * @param c code point to append
 * @see U16_APPEND
 * @stable ICU 2.4
 */
#define U16_APPEND_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    if((uint32_t)(c)>0xffff) { \
        /* supplementary code point: lead surrogate, then trail surrogate */ \
        (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
        (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
    } else { \
        (s)[(i)++]=(uint16_t)(c); \
    } \
} UPRV_BLOCK_MACRO_END

/**
 * Append a code point to a string, overwriting 1 or 2 code units.
 * The offset points to the current end of the string contents
 * and is advanced (post-increment).
 * "Safe" macro, checks for a valid code point.
 * If a surrogate pair is written, checks for sufficient space in the string.
 * If the code point is not valid or a trail surrogate does not fit,
 * then isError is set to true.
 *
 * A single code unit (c<=0xffff) is written without a capacity check,
 * which is why the caller must guarantee i<capacity.
 *
 * @param s UChar * string buffer (written to)
 * @param i string offset, must be i<capacity
 * @param capacity size of the string buffer
 * @param c code point to append
 * @param isError output UBool set to true if an error occurs, otherwise not modified
 * @see U16_APPEND_UNSAFE
 * @stable ICU 2.4
 */
#if (NTDDI_VERSION >= NTDDI_WIN10_CO)
#define U16_APPEND(s, i, capacity, c, isError) UPRV_BLOCK_MACRO_BEGIN { \
    if((uint32_t)(c)<=0xffff) { \
        (s)[(i)++]=(uint16_t)(c); \
    } else if((uint32_t)(c)<=0x10ffff && (i)+1<(capacity)) { \
        (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
        (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
    } else /* c>0x10ffff or not enough space */ { \
        (isError)=true; \
    } \
} UPRV_BLOCK_MACRO_END
#else
#define U16_APPEND(s, i, capacity, c, isError) { \
    if((uint32_t)(c)<=0xffff) { \
        (s)[(i)++]=(uint16_t)(c); \
    } else if((uint32_t)(c)<=0x10ffff && (i)+1<(capacity)) { \
        (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
        (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
    } else /* c>0x10ffff or not enough space */ { \
        (isError)=TRUE; \
    } \
}
#endif

/**
 * Advance the string offset from one code point boundary to the next.
 * (Post-incrementing iteration.)
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * The unit at the offset is tested with U16_IS_LEAD while the offset is
 * post-incremented; a lead surrogate causes one more increment. The unit
 * after a lead surrogate is skipped without being read or checked.
 *
 * @param s const UChar * string
 * @param i string offset
 * @see U16_FWD_1
 * @stable ICU 2.4
 */
#define U16_FWD_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
    if(U16_IS_LEAD((s)[(i)++])) { \
        ++(i); \
    } \
} UPRV_BLOCK_MACRO_END

/**
 * Advance the string offset from one code point boundary to the next.
 * (Post-incrementing iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * @param s const UChar * string
 * @param i string offset, must be i<length
 * @param length string length
 * @see U16_FWD_1_UNSAFE
 * @stable ICU 2.4
 */
#define U16_FWD_1(s, i, length) UPRV_BLOCK_MACRO_BEGIN { \
    /* skip a second unit only if it exists and completes a surrogate pair */ \
    if(U16_IS_LEAD((s)[(i)++]) && (i)!=(length) && U16_IS_TRAIL((s)[i])) { \
        ++(i); \
    } \
} UPRV_BLOCK_MACRO_END

/**
 * Advance the string offset from one code point boundary to the n-th next one,
 * i.e., move forward by n code points.
 * (Post-incrementing iteration.)
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * @param s const UChar * string
 * @param i string offset
 * @param n number of code points to skip
 * @see U16_FWD_N
 * @stable ICU 2.4
 */
#define U16_FWD_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \
    int32_t __N=(n); \
    while(__N>0) { \
        U16_FWD_1_UNSAFE(s, i); \
        --__N; \
    } \
} UPRV_BLOCK_MACRO_END

/**
 * Advance the string offset from one code point boundary to the n-th next one,
 * i.e., move forward by n code points.
 * (Post-incrementing iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The length can be negative for a NUL-terminated string.
 * Iteration stops early at the end of the string: when i reaches length, or,
 * for a negative length, when the unit at i is NUL.
 *
 * @param s const UChar * string
 * @param i int32_t string offset, must be i<length
 * @param length int32_t string length
 * @param n number of code points to skip
 * @see U16_FWD_N_UNSAFE
 * @stable ICU 2.4
 */
#define U16_FWD_N(s, i, length, n) UPRV_BLOCK_MACRO_BEGIN { \
    int32_t __N=(n); \
    while(__N>0 && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { \
        U16_FWD_1(s, i, length); \
        --__N; \
    } \
} UPRV_BLOCK_MACRO_END

/**
 * Adjust a random-access offset to a code point boundary
 * at the start of a code point.
 * If the offset points to the trail surrogate of a surrogate pair,
 * then the offset is decremented.
 * Otherwise, it is not modified.
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * Only the unit at the offset is examined: any trail surrogate causes the
 * decrement, without checking the preceding unit or the start of the string.
 *
 * @param s const UChar * string
 * @param i string offset
 * @see U16_SET_CP_START
 * @stable ICU 2.4
 */
#define U16_SET_CP_START_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
    if(U16_IS_TRAIL((s)[i])) { \
        --(i); \
    } \
} UPRV_BLOCK_MACRO_END

/**
 * Move a random-access offset back to the start of the code point it lies in.
 * The offset is decremented only when it is on a trail surrogate, is greater
 * than start, and the preceding unit is a lead surrogate; otherwise it is
 * left unchanged.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<=i
 * @see U16_SET_CP_START_UNSAFE
 * @stable ICU 2.4
 */
#define U16_SET_CP_START(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \
    if(U16_IS_TRAIL((s)[i])) { \
        /* look back only when a preceding unit exists */ \
        if((i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
            --(i); \
        } \
    } \
} UPRV_BLOCK_MACRO_END

/* definitions with backward iteration -------------------------------------- */

/**
 * Move the string offset from one code point boundary to the previous one
 * and get the code point between them.
 * (Pre-decrementing backward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * The input offset may be the same as the string length.
 * If the offset is behind a trail surrogate unit
 * for a supplementary code point, then the macro will read
 * the preceding lead surrogate as well.
 * If the offset is behind a lead surrogate, then that itself
 * will be returned as the code point.
 * The result is undefined if the offset is behind a single, unpaired trail surrogate.
 *
 * The offset is pre-decremented once for the first unit read and a second
 * time when that unit is a trail surrogate, so it moves back by 1 or 2.
 * i must be a modifiable lvalue; s, i and c appear more than once in the expansion.
 *
 * @param s const UChar * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U16_PREV
 * @stable ICU 2.4
 */
#define U16_PREV_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(s)[--(i)]; \
    if(U16_IS_TRAIL(c)) { \
        (c)=U16_GET_SUPPLEMENTARY((s)[--(i)], (c)); \
    } \
} UPRV_BLOCK_MACRO_END

/**
 * Move the string offset from one code point boundary to the previous one
 * and get the code point between them.
 * (Pre-decrementing backward iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The input offset may be the same as the string length.
 * If the offset is behind a trail surrogate unit
 * for a supplementary code point, then the macro will read
 * the preceding lead surrogate as well.
 * If the offset is behind a lead surrogate or behind a single, unpaired
 * trail surrogate, then c is set to that unpaired surrogate.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<i
 * @param c output UChar32 variable
 * @see U16_PREV_UNSAFE
 * @stable ICU 2.4
 */
#define U16_PREV(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(s)[--(i)]; \
    if(U16_IS_TRAIL(c)) { \
        uint16_t __c2; \
        /* consume the preceding unit only if it exists and is a lead surrogate */ \
        if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
            --(i); \
            (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
        } \
    } \
} UPRV_BLOCK_MACRO_END

#if (NTDDI_VERSION >= NTDDI_WIN10_19H1)
/**
 * Move the string offset from one code point boundary to the previous one
 * and get the code point between them.
 * (Pre-decrementing backward iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The input offset may be the same as the string length.
 * If the offset is behind a trail surrogate unit
 * for a supplementary code point, then the macro will read
 * the preceding lead surrogate as well.
 * If the offset is behind a lead surrogate or behind a single, unpaired
 * trail surrogate, then c is set to U+FFFD.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<i
 * @param c output UChar32 variable
 * @see U16_PREV_UNSAFE
 * @stable ICU 60
 */
#define U16_PREV_OR_FFFD(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(s)[--(i)]; \
    if(U16_IS_SURROGATE(c)) { \
        uint16_t __c2; \
        if(U16_IS_SURROGATE_TRAIL(c) && (i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
            --(i); \
            (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
        } else { \
            (c)=0xfffd; \
        } \
    } \
} UPRV_BLOCK_MACRO_END
#endif // (NTDDI_VERSION >= NTDDI_WIN10_19H1)

/**
 * Move the string offset from one code point boundary to the previous one.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * The offset is pre-decremented and the unit there is tested with
 * U16_IS_TRAIL; a trail surrogate causes one more decrement, without
 * checking the preceding unit or the start of the string.
 *
 * @param s const UChar * string
 * @param i string offset
 * @see U16_BACK_1
 * @stable ICU 2.4
 */
#define U16_BACK_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
    if(U16_IS_TRAIL((s)[--(i)])) { \
        --(i); \
    } \
} UPRV_BLOCK_MACRO_END

/**
 * Move the string offset from one code point boundary to the previous one.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<i
 * @see U16_BACK_1_UNSAFE
 * @stable ICU 2.4
 */
#define U16_BACK_1(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \
    /* step over a second unit only if it exists and completes a surrogate pair */ \
    if(U16_IS_TRAIL((s)[--(i)]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
        --(i); \
    } \
} UPRV_BLOCK_MACRO_END

/**
 * Move the string offset backward by n code points, i.e. from one code point
 * boundary to the n-th one before it, by applying U16_BACK_1_UNSAFE n times.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-16.
 * Nothing happens when n is zero or negative.
 *
 * @param s const UChar * string
 * @param i string offset
 * @param n number of code points to skip
 * @see U16_BACK_N
 * @stable ICU 2.4
 */
#define U16_BACK_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \
    int32_t __N; \
    for(__N=(n); __N>0; --__N) { \
        U16_BACK_1_UNSAFE(s, i); \
    } \
} UPRV_BLOCK_MACRO_END

/**
 * Move the string offset from one code point boundary to the n-th one before it,
 * i.e., move backward by n code points.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 * Iteration stops early when the offset reaches start.
 *
 * @param s const UChar * string
 * @param start start of string
 * @param i string offset, must be start<i
 * @param n number of code points to skip
 * @see U16_BACK_N_UNSAFE
 * @stable ICU 2.4
 */
#define U16_BACK_N(s, start, i, n) UPRV_BLOCK_MACRO_BEGIN { \
    int32_t __N=(n); \
    while(__N>0 && (i)>(start)) { \
        U16_BACK_1(s, start, i); \
        --__N; \
    } \
} UPRV_BLOCK_MACRO_END

/**
 * Adjust a random-access offset to a code point boundary after a code point.
 * If the offset is behind the lead surrogate of a surrogate pair,
 * then the offset is incremented.
 * Otherwise, it is not modified.
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * Only the unit before the offset, (s)[(i)-1], is examined: any lead surrogate
 * there causes the increment, without checking the unit at the offset or the
 * string boundaries.
 *
 * @param s const UChar * string
 * @param i string offset
 * @see U16_SET_CP_LIMIT
 * @stable ICU 2.4
 */
#define U16_SET_CP_LIMIT_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
    if(U16_IS_LEAD((s)[(i)-1])) { \
        ++(i); \
    } \
} UPRV_BLOCK_MACRO_END

/**
 * Move a random-access offset forward to the boundary after the code point
 * it lies in.
 * The offset is incremented only when it is strictly between start and the
 * end of the string, the unit before it is a lead surrogate and the unit at
 * it is a trail surrogate; otherwise it is left unchanged.
 * The input offset may be the same as the string length.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * @param s const UChar * string
 * @param start int32_t starting string offset (usually 0)
 * @param i int32_t string offset, start<=i<=length
 * @param length int32_t string length
 * @see U16_SET_CP_LIMIT_UNSAFE
 * @stable ICU 2.4
 */
#define U16_SET_CP_LIMIT(s, start, i, length) UPRV_BLOCK_MACRO_BEGIN { \
    /* both neighbours must exist before they are inspected */ \
    if((start)<(i) && ((i)<(length) || (length)<0)) { \
        if(U16_IS_LEAD((s)[(i)-1]) && U16_IS_TRAIL((s)[i])) { \
            ++(i); \
        } \
    } \
} UPRV_BLOCK_MACRO_END

#endif

// utf_old.h
// Copyright (C) 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
*   Copyright (C) 2002-2012, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  utf_old.h
*   encoding:   UTF-8
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2002sep21
*   created by: Markus W. Scherer
*/

/**
 * \file
 * \brief C API: Deprecated macros for Unicode string handling
 *
 * The macros in utf_old.h are all deprecated and their use discouraged.
 * Some of the design principles behind the set of UTF macros
 * have changed or proved impractical.
 * Almost all of the old "UTF macros" are at least renamed.
 * If you are looking for a new equivalent to an old macro, please see the
 * comment at the old one.
 *
 * Brief summary of reasons for deprecation:
 * - Switch on UTF_SIZE (selection of UTF-8/16/32 default string processing)
 *   was impractical.
 * - Switch on UTF_SAFE etc. (selection of unsafe/safe/strict default string processing)
 *   was of little use and impractical.
 * - Whole classes of macros became obsolete outside of the UTF_SIZE/UTF_SAFE
 *   selection framework: UTF32_ macros (all trivial)
 *   and UTF_ default and intermediate macros (all aliases).
 * - The selection framework also caused many macro aliases.
 * - Change in Unicode standard: "irregular" sequences (3.0) became illegal (3.2).
 * - Change of language in Unicode standard:
 *   Growing distinction between internal x-bit Unicode strings and external UTF-x
 *   forms, with the former more lenient.
 *   Suggests renaming of UTF16_ macros to U16_.
 * - The prefix "UTF_" without a width number confused some users.
 * - "Safe" append macros needed the addition of an error indicator output.
 * - "Safe" UTF-8 macros used legitimate (if rarely used) code point values
 *   to indicate error conditions.
 * - The use of the "_CHAR" infix for code point operations confused some users.
 *
 * More details:
 *
 * Until ICU 2.2, utf.h theoretically allowed to choose among UTF-8/16/32
 * for string processing, and among unsafe/safe/strict default macros for that.
 *
 * It proved nearly impossible to write non-trivial, high-performance code
 * that is UTF-generic.
 * Unsafe default macros would be dangerous for default string processing,
 * and the main reason for the "strict" versions disappeared:
 * Between Unicode 3.0 and 3.2 all "irregular" UTF-8 sequences became illegal.
 * The only other conditions that "strict" checked for were non-characters,
 * which are valid during processing. Only during text input/output should they
 * be checked, and at that time other well-formedness checks may be
 * necessary or useful as well.
 * This can still be done by using U16_NEXT and U_IS_UNICODE_NONCHAR
 * or U_IS_UNICODE_CHAR.
 *
 * The old UTF8_..._SAFE macros also used some normal Unicode code points
 * to indicate malformed sequences.
 * The new UTF8_ macros without suffix use negative values instead.
 *
 * The entire contents of utf32.h was moved here without replacement
 * because all those macros were trivial and
 * were meaningful only in the framework of choosing the UTF size.
 *
 * See Jitterbug 2150 and its discussion on the ICU mailing list
 * in September 2002.
 *
 * <hr>
 *
 * <em>Obsolete part</em> of pre-ICU 2.4 utf.h file documentation:
 *
 * <p>The original concept for these files was for ICU to allow
 * in principle to set which UTF (UTF-8/16/32) is used internally
 * by defining UTF_SIZE to either 8, 16, or 32. utf.h would then define the UChar type
 * accordingly. UTF-16 was the default.</p>
 *
 * <p>This concept has been abandoned.
 * A lot of the ICU source code assumes UChar strings are in UTF-16.
 * This is especially true for low-level code like
 * conversion, normalization, and collation.
 * The utf.h header enforces the default of UTF-16.
 * The UTF-8 and UTF-32 macros remain for now for completeness and backward compatibility.</p>
 *
 * <p>Accordingly, utf.h defines UChar to be an unsigned 16-bit integer. If this matches wchar_t, then
 * UChar is defined to be exactly wchar_t, otherwise uint16_t.</p>
 *
 * <p>UChar32 is defined to be a signed 32-bit integer (int32_t), large enough for a 21-bit
 * Unicode code point (Unicode scalar value, 0..0x10ffff).
 * Before ICU 2.4, the definition of UChar32 was similarly platform-dependent as
 * the definition of UChar. For details see the documentation for UChar32 itself.</p>
 *
 * <p>utf.h also defines a number of C macros for handling single Unicode code points and
 * for using UTF Unicode strings. It includes utf8.h, utf16.h, and utf32.h for the actual
 * implementations of those macros and then aliases one set of them (for UTF-16) for general use.
 * The UTF-specific macros have the UTF size in the macro name prefixes (UTF16_...), while
 * the general alias macros always begin with UTF_...</p>
 *
 * <p>Many string operations can be done with or without error checking.
 * Where such a distinction is useful, there are two versions of the macros, "unsafe" and "safe"
 * ones with ..._UNSAFE and ..._SAFE suffixes. The unsafe macros are fast but may cause
 * program failures if the strings are not well-formed. The safe macros have an additional, boolean
 * parameter "strict". If strict is false, then only illegal sequences are detected.
 * Otherwise, irregular sequences and non-characters are detected as well (like single surrogates).
 * Safe macros return special error code points for illegal/irregular sequences:
 * Typically, U+ffff, or values that would result in a code unit sequence of the same length
 * as the erroneous input sequence.<br>
 * Note that _UNSAFE macros have fewer parameters: They do not have the strictness parameter, and
 * they do not have start/length parameters for boundary checking.</p>
 *
 * <p>Here, the macros are aliased in two steps:
 * In the first step, the UTF-specific macros with UTF16_ prefix and _UNSAFE and _SAFE suffixes are
 * aliased according to the UTF_SIZE to macros with UTF_ prefix and the same suffixes and signatures.
 * Then, in a second step, the default, general alias macros are set to use either the unsafe or
 * the safe/not strict (default) or the safe/strict macro;
 * these general macros do not have a strictness parameter.</p>
 *
 * <p>It is possible to change the default choice for the general alias macros to be unsafe, safe/not strict or safe/strict.
 * The default is safe/not strict. It is not recommended to select the unsafe macros as the basis for
 * Unicode string handling in ICU! To select this, define UTF_SAFE, UTF_STRICT, or UTF_UNSAFE.</p>
 *
 * <p>For general use, one should use the default, general macros with UTF_ prefix and no _SAFE/_UNSAFE suffix.
 * Only in some cases it may be necessary to control the choice of macro directly and use a less generic alias.
 * For example, if it can be assumed that a string is well-formed and the index will stay within the bounds,
 * then the _UNSAFE version may be used.
 * If a UTF-8 string is to be processed, then the macros with UTF8_ prefixes need to be used.</p>
 *
 * <hr>
* * Deprecated ICU 2.4. Use the macros in utf.h, utf16.h, utf8.h instead. */ #ifndef __UTF_OLD_H__ #define __UTF_OLD_H__ /** * \def U_HIDE_OBSOLETE_UTF_OLD_H * * Hides the obsolete definitions in unicode/utf_old.h. * Recommended to be set to 1 at compile time to make sure * the long-deprecated macros are no longer used. * * For reasons for the deprecation see the utf_old.h file comments. * * @internal */ #ifndef U_HIDE_OBSOLETE_UTF_OLD_H # define U_HIDE_OBSOLETE_UTF_OLD_H 0 #endif #if !defined(U_HIDE_DEPRECATED_API) && !U_HIDE_OBSOLETE_UTF_OLD_H /* Formerly utf.h, part 1 --------------------------------------------------- */ #ifdef U_USE_UTF_DEPRECATES /** * Unicode string and array offset and index type. * ICU always counts Unicode code units (UChars) for * string offsets, indexes, and lengths, not Unicode code points. * * @obsolete ICU 2.6. Use int32_t directly instead since this API will be removed in that release. */ typedef int32_t UTextOffset; #endif /** Number of bits in a Unicode string code unit - ICU uses 16-bit Unicode. @deprecated ICU 2.4. Obsolete, see utf_old.h. */ #define UTF_SIZE 16 /** * The default choice for general Unicode string macros is to use the ..._SAFE macro implementations * with strict=false. * * @deprecated ICU 2.4. Obsolete, see utf_old.h. */ #define UTF_SAFE /** @deprecated ICU 2.4. Obsolete, see utf_old.h. */ #undef UTF_UNSAFE /** @deprecated ICU 2.4. Obsolete, see utf_old.h. */ #undef UTF_STRICT /** * UTF8_ERROR_VALUE_1 and UTF8_ERROR_VALUE_2 are special error values for UTF-8, * which need 1 or 2 bytes in UTF-8: * \code * U+0015 = NAK = Negative Acknowledge, C0 control character * U+009f = highest C1 control character * \endcode * * These are used by UTF8_..._SAFE macros so that they can return an error value * that needs the same number of code units (bytes) as were seen by * a macro. They should be tested with UTF_IS_ERROR() or UTF_IS_VALID(). * * @deprecated ICU 2.4. Obsolete, see utf_old.h. 
*/ #define UTF8_ERROR_VALUE_1 0x15 /** * See documentation on UTF8_ERROR_VALUE_1 for details. * * @deprecated ICU 2.4. Obsolete, see utf_old.h. */ #define UTF8_ERROR_VALUE_2 0x9f /** * Error value for all UTFs. This code point value will be set by macros with error * checking if an error is detected. * * @deprecated ICU 2.4. Obsolete, see utf_old.h. */ #define UTF_ERROR_VALUE 0xffff /** * Is a given 32-bit code an error value * as returned by one of the macros for any UTF? * * @deprecated ICU 2.4. Obsolete, see utf_old.h. */ #define UTF_IS_ERROR(c) \ (((c)&0xfffe)==0xfffe || (c)==UTF8_ERROR_VALUE_1 || (c)==UTF8_ERROR_VALUE_2) /** * This is a combined macro: Is c a valid Unicode value _and_ not an error code? * * @deprecated ICU 2.4. Obsolete, see utf_old.h. */ #define UTF_IS_VALID(c) \ (UTF_IS_UNICODE_CHAR(c) && \ (c)!=UTF8_ERROR_VALUE_1 && (c)!=UTF8_ERROR_VALUE_2) /** * Is this code unit or code point a surrogate (U+d800..U+dfff)? * @deprecated ICU 2.4. Renamed to U_IS_SURROGATE and U16_IS_SURROGATE, see utf_old.h. */ #define UTF_IS_SURROGATE(uchar) (((uchar)&0xfffff800)==0xd800) /** * Is a given 32-bit code point a Unicode noncharacter? * * @deprecated ICU 2.4. Renamed to U_IS_UNICODE_NONCHAR, see utf_old.h. */ #define UTF_IS_UNICODE_NONCHAR(c) \ ((c)>=0xfdd0 && \ ((uint32_t)(c)<=0xfdef || ((c)&0xfffe)==0xfffe) && \ (uint32_t)(c)<=0x10ffff) /** * Is a given 32-bit value a Unicode code point value (0..U+10ffff) * that can be assigned a character? * * Code points that are not characters include: * - single surrogate code points (U+d800..U+dfff, 2048 code points) * - the last two code points on each plane (U+__fffe and U+__ffff, 34 code points) * - U+fdd0..U+fdef (new with Unicode 3.1, 32 code points) * - the highest Unicode code point value is U+10ffff * * This means that all code points below U+d800 are character code points, * and that boundary is tested first for performance. * * @deprecated ICU 2.4. Renamed to U_IS_UNICODE_CHAR, see utf_old.h. 
*/ #define UTF_IS_UNICODE_CHAR(c) \ ((uint32_t)(c)<0xd800 || \ ((uint32_t)(c)>0xdfff && \ (uint32_t)(c)<=0x10ffff && \ !UTF_IS_UNICODE_NONCHAR(c))) /* Formerly utf8.h ---------------------------------------------------------- */ /** * \var utf8_countTrailBytes * Internal array with numbers of trail bytes for any given byte used in * lead byte position. * * This is internal since it is not meant to be called directly by external clients; * however it is called by public macros in this file and thus must remain stable, * and should not be hidden when other internal functions are hidden (otherwise * public macros would fail to compile). * @internal */ #ifdef U_UTF8_IMPL // No forward declaration if compiling utf_impl.cpp, which defines utf8_countTrailBytes. #elif defined(U_STATIC_IMPLEMENTATION) || defined(U_COMMON_IMPLEMENTATION) U_CAPI const uint8_t utf8_countTrailBytes[]; #else U_CFUNC U_IMPORT const uint8_t utf8_countTrailBytes[]; #endif /** * Count the trail bytes for a UTF-8 lead byte. * @deprecated ICU 2.4. Renamed to U8_COUNT_TRAIL_BYTES, see utf_old.h. */ #define UTF8_COUNT_TRAIL_BYTES(leadByte) (utf8_countTrailBytes[(uint8_t)leadByte]) /** * Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value. * @deprecated ICU 2.4. Renamed to U8_MASK_LEAD_BYTE, see utf_old.h. */ #define UTF8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1) /** Is this this code point a single code unit (byte)? @deprecated ICU 2.4. Renamed to U8_IS_SINGLE, see utf_old.h. */ #define UTF8_IS_SINGLE(uchar) (((uchar)&0x80)==0) /** Is this this code unit the lead code unit (byte) of a code point? @deprecated ICU 2.4. Renamed to U8_IS_LEAD, see utf_old.h. */ #define UTF8_IS_LEAD(uchar) ((uint8_t)((uchar)-0xc0)<0x3e) /** Is this this code unit a trailing code unit (byte) of a code point? @deprecated ICU 2.4. Renamed to U8_IS_TRAIL, see utf_old.h. 
*/ #define UTF8_IS_TRAIL(uchar) (((uchar)&0xc0)==0x80) /** Does this scalar Unicode value need multiple code units for storage? @deprecated ICU 2.4. Use U8_LENGTH or test ((uint32_t)(c)>0x7f) instead, see utf_old.h. */ #define UTF8_NEED_MULTIPLE_UCHAR(c) ((uint32_t)(c)>0x7f) /** * Given the lead character, how many bytes are taken by this code point. * ICU does not deal with code points >0x10ffff * unless necessary for advancing in the byte stream. * * These length macros take into account that for values >0x10ffff * the UTF8_APPEND_CHAR_SAFE macros would write the error code point 0xffff * with 3 bytes. * Code point comparisons need to be in uint32_t because UChar32 * may be a signed type, and negative values must be recognized. * * @deprecated ICU 2.4. Use U8_LENGTH instead, see utf.h. */ #if 1 # define UTF8_CHAR_LENGTH(c) \ ((uint32_t)(c)<=0x7f ? 1 : \ ((uint32_t)(c)<=0x7ff ? 2 : \ ((uint32_t)((c)-0x10000)>0xfffff ? 3 : 4) \ ) \ ) #else # define UTF8_CHAR_LENGTH(c) \ ((uint32_t)(c)<=0x7f ? 1 : \ ((uint32_t)(c)<=0x7ff ? 2 : \ ((uint32_t)(c)<=0xffff ? 3 : \ ((uint32_t)(c)<=0x10ffff ? 4 : \ ((uint32_t)(c)<=0x3ffffff ? 5 : \ ((uint32_t)(c)<=0x7fffffff ? 6 : 3) \ ) \ ) \ ) \ ) \ ) #endif /** The maximum number of bytes per code point. @deprecated ICU 2.4. Renamed to U8_MAX_LENGTH, see utf_old.h. */ #define UTF8_MAX_CHAR_LENGTH 4 /** Average number of code units compared to UTF-16. @deprecated ICU 2.4. Obsolete, see utf_old.h. */ #define UTF8_ARRAY_SIZE(size) ((5*(size))/2) /** @deprecated ICU 2.4. Renamed to U8_GET_UNSAFE, see utf_old.h. */ #define UTF8_GET_CHAR_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \ int32_t _utf8_get_char_unsafe_index=(int32_t)(i); \ UTF8_SET_CHAR_START_UNSAFE(s, _utf8_get_char_unsafe_index); \ UTF8_NEXT_CHAR_UNSAFE(s, _utf8_get_char_unsafe_index, c); \ } UPRV_BLOCK_MACRO_END /** @deprecated ICU 2.4. Use U8_GET instead, see utf_old.h. 
*/
/* Works on a local int32_t copy of i, so i itself is not modified. */
#define UTF8_GET_CHAR_SAFE(s, start, i, length, c, strict) UPRV_BLOCK_MACRO_BEGIN { \
    int32_t _utf8_get_char_safe_index=(int32_t)(i); \
    UTF8_SET_CHAR_START_SAFE(s, start, _utf8_get_char_safe_index); \
    UTF8_NEXT_CHAR_SAFE(s, _utf8_get_char_safe_index, length, c, strict); \
} UPRV_BLOCK_MACRO_END

/**
 * Reads the byte at s[i] into c and post-increments i. If that byte is in
 * the range 0xc0..0xf4, the lead byte is masked and 1 to 3 following bytes
 * (count taken from UTF8_COUNT_TRAIL_BYTES) are folded into c, advancing i
 * past each of them. No bounds or well-formedness checks are made.
 * @deprecated ICU 2.4. Renamed to U8_NEXT_UNSAFE, see utf_old.h.
 */
#define UTF8_NEXT_CHAR_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(s)[(i)++]; \
    if((uint8_t)((c)-0xc0)<0x35) { \
        uint8_t __count=UTF8_COUNT_TRAIL_BYTES(c); \
        UTF8_MASK_LEAD_BYTE(c, __count); \
        switch(__count) { \
        /* each following branch falls through to the next one */ \
        case 3: \
            (c)=((c)<<6)|((s)[(i)++]&0x3f); \
        case 2: \
            (c)=((c)<<6)|((s)[(i)++]&0x3f); \
        case 1: \
            (c)=((c)<<6)|((s)[(i)++]&0x3f); \
        /* no other branches to optimize switch() */ \
            break; \
        } \
    } \
} UPRV_BLOCK_MACRO_END

/**
 * Writes c at s[i] as 1 byte (c<=0x7f), 2 bytes (c<=0x7ff), 3 bytes
 * (c<=0xffff) or 4 bytes (anything larger), post-incrementing i for every
 * byte written. Neither the buffer capacity nor the value of c is checked.
 * @deprecated ICU 2.4. Renamed to U8_APPEND_UNSAFE, see utf_old.h.
 */
#define UTF8_APPEND_CHAR_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    if((uint32_t)(c)<=0x7f) { \
        (s)[(i)++]=(uint8_t)(c); \
    } else { \
        if((uint32_t)(c)<=0x7ff) { \
            (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0); \
        } else { \
            if((uint32_t)(c)<=0xffff) { \
                (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0); \
            } else { \
                (s)[(i)++]=(uint8_t)(((c)>>18)|0xf0); \
                (s)[(i)++]=(uint8_t)((((c)>>12)&0x3f)|0x80); \
            } \
            (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); \
        } \
        (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \
    } \
} UPRV_BLOCK_MACRO_END

/** Advances i by 1 plus the trail byte count looked up for the byte at s[i]. @deprecated ICU 2.4. Renamed to U8_FWD_1_UNSAFE, see utf_old.h. */
#define UTF8_FWD_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
    (i)+=1+UTF8_COUNT_TRAIL_BYTES((s)[i]); \
} UPRV_BLOCK_MACRO_END

/** Applies UTF8_FWD_1_UNSAFE n times; n is evaluated once into a local counter. @deprecated ICU 2.4. Renamed to U8_FWD_N_UNSAFE, see utf_old.h. */
#define UTF8_FWD_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \
    int32_t __N=(n); \
    while(__N>0) { \
        UTF8_FWD_1_UNSAFE(s, i); \
        --__N; \
    } \
} UPRV_BLOCK_MACRO_END

/** @deprecated ICU 2.4. Renamed to U8_SET_CP_START_UNSAFE, see utf_old.h.
*/
/* Decrements i while s[i] is a trail byte; there is no lower bound check. */
#define UTF8_SET_CHAR_START_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
    while(UTF8_IS_TRAIL((s)[i])) { --(i); } \
} UPRV_BLOCK_MACRO_END

/**
 * Reads s[i] into c and post-increments i. Values below 0x80 are returned
 * as is; lead bytes are handed to utf8_nextCharSafeBody() (which receives
 * &i); any other byte yields UTF8_ERROR_VALUE_1.
 * @deprecated ICU 2.4. Use U8_NEXT instead, see utf_old.h.
 */
#define UTF8_NEXT_CHAR_SAFE(s, i, length, c, strict) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(s)[(i)++]; \
    if((c)>=0x80) { \
        if(UTF8_IS_LEAD(c)) { \
            (c)=utf8_nextCharSafeBody(s, &(i), (int32_t)(length), c, strict); \
        } else { \
            (c)=UTF8_ERROR_VALUE_1; \
        } \
    } \
} UPRV_BLOCK_MACRO_END

/**
 * Writes c<=0x7f directly as one byte; everything else is delegated to
 * utf8_appendCharSafeBody(), whose return value becomes the new i.
 * @deprecated ICU 2.4. Use U8_APPEND instead, see utf_old.h.
 */
#define UTF8_APPEND_CHAR_SAFE(s, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
    if((uint32_t)(c)<=0x7f) { \
        (s)[(i)++]=(uint8_t)(c); \
    } else { \
        (i)=utf8_appendCharSafeBody(s, (int32_t)(i), (int32_t)(length), c, NULL); \
    } \
} UPRV_BLOCK_MACRO_END

/** @deprecated ICU 2.4. Renamed to U8_FWD_1, see utf_old.h. */
#define UTF8_FWD_1_SAFE(s, i, length) U8_FWD_1(s, i, length)

/** @deprecated ICU 2.4. Renamed to U8_FWD_N, see utf_old.h. */
#define UTF8_FWD_N_SAFE(s, i, length, n) U8_FWD_N(s, i, length, n)

/** @deprecated ICU 2.4. Renamed to U8_SET_CP_START, see utf_old.h. */
#define UTF8_SET_CHAR_START_SAFE(s, start, i) U8_SET_CP_START(s, start, i)

/**
 * Pre-decrements i and reads s[i] into c. If that byte is a trail byte, the
 * macro keeps stepping backwards, accumulating 6 payload bits per trail
 * byte, until it meets a byte >=0xc0, which is masked as the lead byte.
 * There is no lower bound check on i.
 * @deprecated ICU 2.4. Renamed to U8_PREV_UNSAFE, see utf_old.h.
 */
#define UTF8_PREV_CHAR_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(s)[--(i)]; \
    if(UTF8_IS_TRAIL(c)) { \
        uint8_t __b, __count=1, __shift=6; \
\
        /* c is a trail byte */ \
        (c)&=0x3f; \
        for(;;) { \
            __b=(s)[--(i)]; \
            if(__b>=0xc0) { \
                UTF8_MASK_LEAD_BYTE(__b, __count); \
                (c)|=(UChar32)__b<<__shift; \
                break; \
            } else { \
                (c)|=(UChar32)(__b&0x3f)<<__shift; \
                ++__count; \
                __shift+=6; \
            } \
        } \
    } \
} UPRV_BLOCK_MACRO_END

/** Pre-decrements i until s[i] is no longer a trail byte (at least one step). @deprecated ICU 2.4. Renamed to U8_BACK_1_UNSAFE, see utf_old.h. */
#define UTF8_BACK_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
    while(UTF8_IS_TRAIL((s)[--(i)])) {} \
} UPRV_BLOCK_MACRO_END

/** @deprecated ICU 2.4. Renamed to U8_BACK_N_UNSAFE, see utf_old.h.
*/
/* Applies UTF8_BACK_1_UNSAFE n times; n is evaluated once into a local counter. */
#define UTF8_BACK_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \
    int32_t __N=(n); \
    while(__N>0) { \
        UTF8_BACK_1_UNSAFE(s, i); \
        --__N; \
    } \
} UPRV_BLOCK_MACRO_END

/** Steps back one code point and then forward one code point. @deprecated ICU 2.4. Renamed to U8_SET_CP_LIMIT_UNSAFE, see utf_old.h. */
#define UTF8_SET_CHAR_LIMIT_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
    UTF8_BACK_1_UNSAFE(s, i); \
    UTF8_FWD_1_UNSAFE(s, i); \
} UPRV_BLOCK_MACRO_END

/**
 * Pre-decrements i and reads s[i] into c. Values below 0x80 are returned as
 * is; values in 0x80..0xbf are handed to utf8_prevCharSafeBody() (which
 * receives &i); anything above 0xbf yields UTF8_ERROR_VALUE_1.
 * @deprecated ICU 2.4. Use U8_PREV instead, see utf_old.h.
 */
#define UTF8_PREV_CHAR_SAFE(s, start, i, c, strict) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(s)[--(i)]; \
    if((c)>=0x80) { \
        if((c)<=0xbf) { \
            (c)=utf8_prevCharSafeBody(s, start, &(i), c, strict); \
        } else { \
            (c)=UTF8_ERROR_VALUE_1; \
        } \
    } \
} UPRV_BLOCK_MACRO_END

/** @deprecated ICU 2.4. Renamed to U8_BACK_1, see utf_old.h. */
#define UTF8_BACK_1_SAFE(s, start, i) U8_BACK_1(s, start, i)

/** @deprecated ICU 2.4. Renamed to U8_BACK_N, see utf_old.h. */
#define UTF8_BACK_N_SAFE(s, start, i, n) U8_BACK_N(s, start, i, n)

/** @deprecated ICU 2.4. Renamed to U8_SET_CP_LIMIT, see utf_old.h. */
#define UTF8_SET_CHAR_LIMIT_SAFE(s, start, i, length) U8_SET_CP_LIMIT(s, start, i, length)

/* Formerly utf16.h --------------------------------------------------------- */

/** Is uchar a first/lead surrogate? True when the bits above the low 10 equal 0xd800. @deprecated ICU 2.4. Renamed to U_IS_LEAD and U16_IS_LEAD, see utf_old.h. */
#define UTF_IS_FIRST_SURROGATE(uchar) (((uchar)&0xfffffc00)==0xd800)

/** Is uchar a second/trail surrogate? True when the bits above the low 10 equal 0xdc00. @deprecated ICU 2.4. Renamed to U_IS_TRAIL and U16_IS_TRAIL, see utf_old.h. */
#define UTF_IS_SECOND_SURROGATE(uchar) (((uchar)&0xfffffc00)==0xdc00)

/** Assuming c is a surrogate, is it a first/lead surrogate? Tests only bit 0x400. @deprecated ICU 2.4. Renamed to U_IS_SURROGATE_LEAD and U16_IS_SURROGATE_LEAD, see utf_old.h. */
#define UTF_IS_SURROGATE_FIRST(c) (((c)&0x400)==0)

/** Helper constant for UTF16_GET_PAIR_VALUE. @deprecated ICU 2.4. Renamed to U16_SURROGATE_OFFSET, see utf_old.h.
*/ #define UTF_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000) /** Get the UTF-32 value from the surrogate code units. @deprecated ICU 2.4. Renamed to U16_GET_SUPPLEMENTARY, see utf_old.h. */ #define UTF16_GET_PAIR_VALUE(first, second) \ (((first)<<10UL)+(second)-UTF_SURROGATE_OFFSET) /** @deprecated ICU 2.4. Renamed to U16_LEAD, see utf_old.h. */ #define UTF_FIRST_SURROGATE(supplementary) (UChar)(((supplementary)>>10)+0xd7c0) /** @deprecated ICU 2.4. Renamed to U16_TRAIL, see utf_old.h. */ #define UTF_SECOND_SURROGATE(supplementary) (UChar)(((supplementary)&0x3ff)|0xdc00) /** @deprecated ICU 2.4. Renamed to U16_LEAD, see utf_old.h. */ #define UTF16_LEAD(supplementary) UTF_FIRST_SURROGATE(supplementary) /** @deprecated ICU 2.4. Renamed to U16_TRAIL, see utf_old.h. */ #define UTF16_TRAIL(supplementary) UTF_SECOND_SURROGATE(supplementary) /** @deprecated ICU 2.4. Renamed to U16_IS_SINGLE, see utf_old.h. */ #define UTF16_IS_SINGLE(uchar) !UTF_IS_SURROGATE(uchar) /** @deprecated ICU 2.4. Renamed to U16_IS_LEAD, see utf_old.h. */ #define UTF16_IS_LEAD(uchar) UTF_IS_FIRST_SURROGATE(uchar) /** @deprecated ICU 2.4. Renamed to U16_IS_TRAIL, see utf_old.h. */ #define UTF16_IS_TRAIL(uchar) UTF_IS_SECOND_SURROGATE(uchar) /** Does this scalar Unicode value need multiple code units for storage? @deprecated ICU 2.4. Use U16_LENGTH or test ((uint32_t)(c)>0xffff) instead, see utf_old.h. */ #define UTF16_NEED_MULTIPLE_UCHAR(c) ((uint32_t)(c)>0xffff) /** @deprecated ICU 2.4. Renamed to U16_LENGTH, see utf_old.h. */ #define UTF16_CHAR_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2) /** @deprecated ICU 2.4. Renamed to U16_MAX_LENGTH, see utf_old.h. */ #define UTF16_MAX_CHAR_LENGTH 2 /** Average number of code units compared to UTF-16. @deprecated ICU 2.4. Obsolete, see utf_old.h. */ #define UTF16_ARRAY_SIZE(size) (size) /** * Get a single code point from an offset that points to any * of the code units that belong to that code point. 
* Assume 0<=i=(start) && UTF_IS_FIRST_SURROGATE(__c2=(s)[(i)-1])) { \ (c)=UTF16_GET_PAIR_VALUE(__c2, (c)); \ /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() and UTF_IS_UNICODE_CHAR() */ \ } else if(strict) {\ /* unmatched second surrogate */ \ (c)=UTF_ERROR_VALUE; \ } \ } \ } else if((strict) && !UTF_IS_UNICODE_CHAR(c)) { \ (c)=UTF_ERROR_VALUE; \ } \ } UPRV_BLOCK_MACRO_END /** @deprecated ICU 2.4. Renamed to U16_NEXT_UNSAFE, see utf_old.h. */ #define UTF16_NEXT_CHAR_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \ (c)=(s)[(i)++]; \ if(UTF_IS_FIRST_SURROGATE(c)) { \ (c)=UTF16_GET_PAIR_VALUE((c), (s)[(i)++]); \ } \ } UPRV_BLOCK_MACRO_END /** @deprecated ICU 2.4. Renamed to U16_APPEND_UNSAFE, see utf_old.h. */ #define UTF16_APPEND_CHAR_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \ if((uint32_t)(c)<=0xffff) { \ (s)[(i)++]=(uint16_t)(c); \ } else { \ (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \ (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \ } \ } UPRV_BLOCK_MACRO_END /** @deprecated ICU 2.4. Renamed to U16_FWD_1_UNSAFE, see utf_old.h. */ #define UTF16_FWD_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \ if(UTF_IS_FIRST_SURROGATE((s)[(i)++])) { \ ++(i); \ } \ } UPRV_BLOCK_MACRO_END /** @deprecated ICU 2.4. Renamed to U16_FWD_N_UNSAFE, see utf_old.h. */ #define UTF16_FWD_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \ int32_t __N=(n); \ while(__N>0) { \ UTF16_FWD_1_UNSAFE(s, i); \ --__N; \ } \ } UPRV_BLOCK_MACRO_END /** @deprecated ICU 2.4. Renamed to U16_SET_CP_START_UNSAFE, see utf_old.h. */ #define UTF16_SET_CHAR_START_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \ if(UTF_IS_SECOND_SURROGATE((s)[i])) { \ --(i); \ } \ } UPRV_BLOCK_MACRO_END /** @deprecated ICU 2.4. Use U16_NEXT instead, see utf_old.h. 
*/
/*
 * Reads s[i] into c and post-increments i. A lead surrogate is combined with
 * the following unit (which is then consumed too) only if i<length and that
 * unit is a trail surrogate. With strict set, unpaired surrogates and values
 * rejected by UTF_IS_UNICODE_CHAR() become UTF_ERROR_VALUE.
 */
#define UTF16_NEXT_CHAR_SAFE(s, i, length, c, strict) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(s)[(i)++]; \
    if(UTF_IS_FIRST_SURROGATE(c)) { \
        uint16_t __c2; \
        if((i)<(length) && UTF_IS_SECOND_SURROGATE(__c2=(s)[(i)])) { \
            ++(i); \
            (c)=UTF16_GET_PAIR_VALUE((c), __c2); \
            /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() and UTF_IS_UNICODE_CHAR() */ \
        } else if(strict) {\
            /* unmatched first surrogate */ \
            (c)=UTF_ERROR_VALUE; \
        } \
    } else if((strict) && !UTF_IS_UNICODE_CHAR(c)) { \
        /* unmatched second surrogate or other non-character */ \
        (c)=UTF_ERROR_VALUE; \
    } \
} UPRV_BLOCK_MACRO_END

/**
 * Writes c at s[i], advancing i: one unit for c<=0xffff, a surrogate pair for
 * c<=0x10ffff when i+1<length, and a single UTF_ERROR_VALUE unit when the
 * pair does not fit or c>0x10ffff. The single-unit writes are not checked
 * against length.
 * @deprecated ICU 2.4. Use U16_APPEND instead, see utf_old.h.
 */
#define UTF16_APPEND_CHAR_SAFE(s, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
    if((uint32_t)(c)<=0xffff) { \
        (s)[(i)++]=(uint16_t)(c); \
    } else if((uint32_t)(c)<=0x10ffff) { \
        if((i)+1<(length)) { \
            (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
            (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
        } else /* not enough space */ { \
            (s)[(i)++]=UTF_ERROR_VALUE; \
        } \
    } else /* c>0x10ffff, write error value */ { \
        (s)[(i)++]=UTF_ERROR_VALUE; \
    } \
} UPRV_BLOCK_MACRO_END

/** @deprecated ICU 2.4. Renamed to U16_FWD_1, see utf_old.h. */
#define UTF16_FWD_1_SAFE(s, i, length) U16_FWD_1(s, i, length)

/** @deprecated ICU 2.4. Renamed to U16_FWD_N, see utf_old.h. */
#define UTF16_FWD_N_SAFE(s, i, length, n) U16_FWD_N(s, i, length, n)

/** @deprecated ICU 2.4. Renamed to U16_SET_CP_START, see utf_old.h. */
#define UTF16_SET_CHAR_START_SAFE(s, start, i) U16_SET_CP_START(s, start, i)

/** Pre-decrements i and reads s[i] into c; a trail surrogate is combined with the preceding unit, which is consumed as well. @deprecated ICU 2.4. Renamed to U16_PREV_UNSAFE, see utf_old.h. */
#define UTF16_PREV_CHAR_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(s)[--(i)]; \
    if(UTF_IS_SECOND_SURROGATE(c)) { \
        (c)=UTF16_GET_PAIR_VALUE((s)[--(i)], (c)); \
    } \
} UPRV_BLOCK_MACRO_END

/** @deprecated ICU 2.4. Renamed to U16_BACK_1_UNSAFE, see utf_old.h.
*/
/* Moves i back by one unit, or by two if the unit at the new i is a trail surrogate. */
#define UTF16_BACK_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
    if(UTF_IS_SECOND_SURROGATE((s)[--(i)])) { \
        --(i); \
    } \
} UPRV_BLOCK_MACRO_END

/** Applies UTF16_BACK_1_UNSAFE n times; n is evaluated once into a local counter. @deprecated ICU 2.4. Renamed to U16_BACK_N_UNSAFE, see utf_old.h. */
#define UTF16_BACK_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \
    int32_t __N=(n); \
    while(__N>0) { \
        UTF16_BACK_1_UNSAFE(s, i); \
        --__N; \
    } \
} UPRV_BLOCK_MACRO_END

/** Increments i once if the unit before it, s[i-1], is a lead surrogate. @deprecated ICU 2.4. Renamed to U16_SET_CP_LIMIT_UNSAFE, see utf_old.h. */
#define UTF16_SET_CHAR_LIMIT_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
    if(UTF_IS_FIRST_SURROGATE((s)[(i)-1])) { \
        ++(i); \
    } \
} UPRV_BLOCK_MACRO_END

/**
 * Pre-decrements i and reads s[i] into c. A trail surrogate is combined with
 * the preceding unit (which is then consumed too) only if i>start and that
 * unit is a lead surrogate. With strict set, unpaired surrogates and values
 * rejected by UTF_IS_UNICODE_CHAR() become UTF_ERROR_VALUE.
 * @deprecated ICU 2.4. Use U16_PREV instead, see utf_old.h.
 */
#define UTF16_PREV_CHAR_SAFE(s, start, i, c, strict) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(s)[--(i)]; \
    if(UTF_IS_SECOND_SURROGATE(c)) { \
        uint16_t __c2; \
        if((i)>(start) && UTF_IS_FIRST_SURROGATE(__c2=(s)[(i)-1])) { \
            --(i); \
            (c)=UTF16_GET_PAIR_VALUE(__c2, (c)); \
            /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() and UTF_IS_UNICODE_CHAR() */ \
        } else if(strict) {\
            /* unmatched second surrogate */ \
            (c)=UTF_ERROR_VALUE; \
        } \
    } else if((strict) && !UTF_IS_UNICODE_CHAR(c)) { \
        /* unmatched first surrogate or other non-character */ \
        (c)=UTF_ERROR_VALUE; \
    } \
} UPRV_BLOCK_MACRO_END

/** @deprecated ICU 2.4. Renamed to U16_BACK_1, see utf_old.h. */
#define UTF16_BACK_1_SAFE(s, start, i) U16_BACK_1(s, start, i)

/** @deprecated ICU 2.4. Renamed to U16_BACK_N, see utf_old.h. */
#define UTF16_BACK_N_SAFE(s, start, i, n) U16_BACK_N(s, start, i, n)

/** @deprecated ICU 2.4. Renamed to U16_SET_CP_LIMIT, see utf_old.h. */
#define UTF16_SET_CHAR_LIMIT_SAFE(s, start, i, length) U16_SET_CP_LIMIT(s, start, i, length)

/* Formerly utf32.h --------------------------------------------------------- */

/*
 * Old documentation:
 *
 * This file defines macros to deal with UTF-32 code units and code points.
 * Signatures and semantics are the same as for the similarly named macros
 * in utf16.h.
* utf32.h is included by utf.h after unicode/umachine.h
* and some common definitions.
*
* Usage: ICU coding guidelines for if() statements should be followed when using these macros.
* Compound statements (curly braces {}) must be used for if-else-while...
* bodies, and all macro statements should be terminated with a semicolon.
*/

/* internal definitions ----------------------------------------------------- */

/**
 * Non-strict: true for any value up to 0x10ffff (compared as uint32_t).
 * Strict: defers to UTF_IS_UNICODE_CHAR(c).
 * @deprecated ICU 2.4. Obsolete, see utf_old.h.
 */
#define UTF32_IS_SAFE(c, strict) \
    (!(strict) ? \
        (uint32_t)(c)<=0x10ffff : \
        UTF_IS_UNICODE_CHAR(c))

/*
 * For the semantics of all of these macros, see utf16.h.
 * The UTF-32 versions are trivial because any code point is
 * encoded using exactly one code unit.
 */

/* single-code point definitions -------------------------------------------- */

/* classes of code unit values */

/** Always 1; the argument is not evaluated. @deprecated ICU 2.4. Obsolete, see utf_old.h. */
#define UTF32_IS_SINGLE(uchar) 1
/** Always 0; the argument is not evaluated. @deprecated ICU 2.4. Obsolete, see utf_old.h. */
#define UTF32_IS_LEAD(uchar) 0
/** Always 0; the argument is not evaluated. @deprecated ICU 2.4. Obsolete, see utf_old.h. */
#define UTF32_IS_TRAIL(uchar) 0

/* number of code units per code point */

/** Always 0; the argument is not evaluated. @deprecated ICU 2.4. Obsolete, see utf_old.h. */
#define UTF32_NEED_MULTIPLE_UCHAR(c) 0
/** Always 1; the argument is not evaluated. @deprecated ICU 2.4. Obsolete, see utf_old.h. */
#define UTF32_CHAR_LENGTH(c) 1
/** @deprecated ICU 2.4. Obsolete, see utf_old.h. */
#define UTF32_MAX_CHAR_LENGTH 1

/* average number of code units compared to UTF-16 */

/** @deprecated ICU 2.4. Obsolete, see utf_old.h. */
#define UTF32_ARRAY_SIZE(size) (size)

/** Reads s[i] into c; i is not modified. @deprecated ICU 2.4. Obsolete, see utf_old.h. */
#define UTF32_GET_CHAR_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(s)[i]; \
} UPRV_BLOCK_MACRO_END

/** Reads s[i] into c and replaces it with UTF_ERROR_VALUE if UTF32_IS_SAFE rejects it; start and length are not used. @deprecated ICU 2.4. Obsolete, see utf_old.h. */
#define UTF32_GET_CHAR_SAFE(s, start, i, length, c, strict) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(s)[i]; \
    if(!UTF32_IS_SAFE(c, strict)) { \
        (c)=UTF_ERROR_VALUE; \
    } \
} UPRV_BLOCK_MACRO_END

/* definitions with forward iteration --------------------------------------- */

/** Reads s[i] into c and post-increments i. @deprecated ICU 2.4. Obsolete, see utf_old.h. */
#define UTF32_NEXT_CHAR_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(s)[(i)++]; \
} UPRV_BLOCK_MACRO_END

/** @deprecated ICU 2.4. Obsolete, see utf_old.h.
*/
/* Stores c at s[i] and post-increments i; no checks. */
#define UTF32_APPEND_CHAR_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    (s)[(i)++]=(c); \
} UPRV_BLOCK_MACRO_END

/** Increments i by one. @deprecated ICU 2.4. Obsolete, see utf_old.h. */
#define UTF32_FWD_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
    ++(i); \
} UPRV_BLOCK_MACRO_END

/** Adds n to i. @deprecated ICU 2.4. Obsolete, see utf_old.h. */
#define UTF32_FWD_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \
    (i)+=(n); \
} UPRV_BLOCK_MACRO_END

/** Intentionally empty: expands to an empty block. @deprecated ICU 2.4. Obsolete, see utf_old.h. */
#define UTF32_SET_CHAR_START_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
} UPRV_BLOCK_MACRO_END

/** Reads s[i] into c, post-increments i, and replaces c with UTF_ERROR_VALUE if UTF32_IS_SAFE rejects it; length is not used. @deprecated ICU 2.4. Obsolete, see utf_old.h. */
#define UTF32_NEXT_CHAR_SAFE(s, i, length, c, strict) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(s)[(i)++]; \
    if(!UTF32_IS_SAFE(c, strict)) { \
        (c)=UTF_ERROR_VALUE; \
    } \
} UPRV_BLOCK_MACRO_END

/** Stores c at s[i] if c<=0x10ffff, otherwise stores 0xfffd; post-increments i either way. length is not used. @deprecated ICU 2.4. Obsolete, see utf_old.h. */
#define UTF32_APPEND_CHAR_SAFE(s, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
    if((uint32_t)(c)<=0x10ffff) { \
        (s)[(i)++]=(c); \
    } else /* c>0x10ffff, write 0xfffd */ { \
        (s)[(i)++]=0xfffd; \
    } \
} UPRV_BLOCK_MACRO_END

/** Increments i by one; length is not used. @deprecated ICU 2.4. Obsolete, see utf_old.h. */
#define UTF32_FWD_1_SAFE(s, i, length) UPRV_BLOCK_MACRO_BEGIN { \
    ++(i); \
} UPRV_BLOCK_MACRO_END

/** Adds n to i and clamps the result to length. @deprecated ICU 2.4. Obsolete, see utf_old.h. */
#define UTF32_FWD_N_SAFE(s, i, length, n) UPRV_BLOCK_MACRO_BEGIN { \
    if(((i)+=(n))>(length)) { \
        (i)=(length); \
    } \
} UPRV_BLOCK_MACRO_END

/** Intentionally empty: expands to an empty block. @deprecated ICU 2.4. Obsolete, see utf_old.h. */
#define UTF32_SET_CHAR_START_SAFE(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \
} UPRV_BLOCK_MACRO_END

/* definitions with backward iteration -------------------------------------- */

/** Pre-decrements i and reads s[i] into c. @deprecated ICU 2.4. Obsolete, see utf_old.h. */
#define UTF32_PREV_CHAR_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(s)[--(i)]; \
} UPRV_BLOCK_MACRO_END

/** Decrements i by one. @deprecated ICU 2.4. Obsolete, see utf_old.h. */
#define UTF32_BACK_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
    --(i); \
} UPRV_BLOCK_MACRO_END

/** @deprecated ICU 2.4. Obsolete, see utf_old.h.
*/
/* Subtracts n from i; no lower bound check. */
#define UTF32_BACK_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \
    (i)-=(n); \
} UPRV_BLOCK_MACRO_END

/** Intentionally empty: expands to an empty block. @deprecated ICU 2.4. Obsolete, see utf_old.h. */
#define UTF32_SET_CHAR_LIMIT_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
} UPRV_BLOCK_MACRO_END

/** Pre-decrements i, reads s[i] into c, and replaces c with UTF_ERROR_VALUE if UTF32_IS_SAFE rejects it; start is not used. @deprecated ICU 2.4. Obsolete, see utf_old.h. */
#define UTF32_PREV_CHAR_SAFE(s, start, i, c, strict) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(s)[--(i)]; \
    if(!UTF32_IS_SAFE(c, strict)) { \
        (c)=UTF_ERROR_VALUE; \
    } \
} UPRV_BLOCK_MACRO_END

/** Decrements i by one; start is not used. @deprecated ICU 2.4. Obsolete, see utf_old.h. */
#define UTF32_BACK_1_SAFE(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \
    --(i); \
} UPRV_BLOCK_MACRO_END

/** Subtracts n from i and clamps the result to start. @deprecated ICU 2.4. Obsolete, see utf_old.h. */
#define UTF32_BACK_N_SAFE(s, start, i, n) UPRV_BLOCK_MACRO_BEGIN { \
    (i)-=(n); \
    if((i)<(start)) { \
        (i)=(start); \
    } \
} UPRV_BLOCK_MACRO_END

/**
 * Intentionally empty: expands to an empty block.
 * NOTE(review): this macro takes (s, i, length) while the UTF8_ and UTF16_
 * SET_CHAR_LIMIT_SAFE macros take (s, start, i, length) — confirm this
 * difference is intended before relying on the three being interchangeable.
 * @deprecated ICU 2.4. Obsolete, see utf_old.h.
 */
#define UTF32_SET_CHAR_LIMIT_SAFE(s, i, length) UPRV_BLOCK_MACRO_BEGIN { \
} UPRV_BLOCK_MACRO_END

/* Formerly utf.h, part 2 --------------------------------------------------- */

/**
 * Estimate the number of code units for a string based on the number of UTF-16 code units.
 * Forwards to UTF16_ARRAY_SIZE.
 *
 * @deprecated ICU 2.4. Obsolete, see utf_old.h.
 */
#define UTF_ARRAY_SIZE(size) UTF16_ARRAY_SIZE(size)

/** Alias of UTF16_GET_CHAR_UNSAFE. @deprecated ICU 2.4. Renamed to U16_GET_UNSAFE, see utf_old.h. */
#define UTF_GET_CHAR_UNSAFE(s, i, c) UTF16_GET_CHAR_UNSAFE(s, i, c)

/** Alias of UTF16_GET_CHAR_SAFE. @deprecated ICU 2.4. Use U16_GET instead, see utf_old.h. */
#define UTF_GET_CHAR_SAFE(s, start, i, length, c, strict) UTF16_GET_CHAR_SAFE(s, start, i, length, c, strict)

/** Alias of UTF16_NEXT_CHAR_UNSAFE. @deprecated ICU 2.4. Renamed to U16_NEXT_UNSAFE, see utf_old.h. */
#define UTF_NEXT_CHAR_UNSAFE(s, i, c) UTF16_NEXT_CHAR_UNSAFE(s, i, c)

/** Alias of UTF16_NEXT_CHAR_SAFE. @deprecated ICU 2.4. Use U16_NEXT instead, see utf_old.h. */
#define UTF_NEXT_CHAR_SAFE(s, i, length, c, strict) UTF16_NEXT_CHAR_SAFE(s, i, length, c, strict)

/** @deprecated ICU 2.4. Renamed to U16_APPEND_UNSAFE, see utf_old.h.
*/
/* Every UTF_* macro in this group is a plain alias of the UTF16_* macro of the same name. */
#define UTF_APPEND_CHAR_UNSAFE(s, i, c) UTF16_APPEND_CHAR_UNSAFE(s, i, c)

/** @deprecated ICU 2.4. Use U16_APPEND instead, see utf_old.h. */
#define UTF_APPEND_CHAR_SAFE(s, i, length, c) UTF16_APPEND_CHAR_SAFE(s, i, length, c)

/** @deprecated ICU 2.4. Renamed to U16_FWD_1_UNSAFE, see utf_old.h. */
#define UTF_FWD_1_UNSAFE(s, i) UTF16_FWD_1_UNSAFE(s, i)

/** @deprecated ICU 2.4. Renamed to U16_FWD_1, see utf_old.h. */
#define UTF_FWD_1_SAFE(s, i, length) UTF16_FWD_1_SAFE(s, i, length)

/** @deprecated ICU 2.4. Renamed to U16_FWD_N_UNSAFE, see utf_old.h. */
#define UTF_FWD_N_UNSAFE(s, i, n) UTF16_FWD_N_UNSAFE(s, i, n)

/** @deprecated ICU 2.4. Renamed to U16_FWD_N, see utf_old.h. */
#define UTF_FWD_N_SAFE(s, i, length, n) UTF16_FWD_N_SAFE(s, i, length, n)

/** @deprecated ICU 2.4. Renamed to U16_SET_CP_START_UNSAFE, see utf_old.h. */
#define UTF_SET_CHAR_START_UNSAFE(s, i) UTF16_SET_CHAR_START_UNSAFE(s, i)

/** @deprecated ICU 2.4. Renamed to U16_SET_CP_START, see utf_old.h. */
#define UTF_SET_CHAR_START_SAFE(s, start, i) UTF16_SET_CHAR_START_SAFE(s, start, i)

/** @deprecated ICU 2.4. Renamed to U16_PREV_UNSAFE, see utf_old.h. */
#define UTF_PREV_CHAR_UNSAFE(s, i, c) UTF16_PREV_CHAR_UNSAFE(s, i, c)

/** @deprecated ICU 2.4. Use U16_PREV instead, see utf_old.h. */
#define UTF_PREV_CHAR_SAFE(s, start, i, c, strict) UTF16_PREV_CHAR_SAFE(s, start, i, c, strict)

/** @deprecated ICU 2.4. Renamed to U16_BACK_1_UNSAFE, see utf_old.h. */
#define UTF_BACK_1_UNSAFE(s, i) UTF16_BACK_1_UNSAFE(s, i)

/** @deprecated ICU 2.4. Renamed to U16_BACK_1, see utf_old.h. */
#define UTF_BACK_1_SAFE(s, start, i) UTF16_BACK_1_SAFE(s, start, i)

/** @deprecated ICU 2.4. Renamed to U16_BACK_N_UNSAFE, see utf_old.h. */
#define UTF_BACK_N_UNSAFE(s, i, n) UTF16_BACK_N_UNSAFE(s, i, n)

/** @deprecated ICU 2.4. Renamed to U16_BACK_N, see utf_old.h. */
#define UTF_BACK_N_SAFE(s, start, i, n) UTF16_BACK_N_SAFE(s, start, i, n)

/** @deprecated ICU 2.4. Renamed to U16_SET_CP_LIMIT_UNSAFE, see utf_old.h.
*/
#define UTF_SET_CHAR_LIMIT_UNSAFE(s, i) UTF16_SET_CHAR_LIMIT_UNSAFE(s, i)

/** Alias of UTF16_SET_CHAR_LIMIT_SAFE. @deprecated ICU 2.4. Renamed to U16_SET_CP_LIMIT, see utf_old.h. */
#define UTF_SET_CHAR_LIMIT_SAFE(s, start, i, length) UTF16_SET_CHAR_LIMIT_SAFE(s, start, i, length)

/* Define default macros (UTF-16 "safe") ------------------------------------ */

/**
 * Does this code unit alone encode a code point (BMP, not a surrogate)?
 * Same as UTF16_IS_SINGLE.
 * @deprecated ICU 2.4. Renamed to U_IS_SINGLE and U16_IS_SINGLE, see utf_old.h.
 */
#define UTF_IS_SINGLE(uchar) U16_IS_SINGLE(uchar)

/**
 * Is this code unit the first one of several (a lead surrogate)?
 * Same as UTF16_IS_LEAD.
 * @deprecated ICU 2.4. Renamed to U_IS_LEAD and U16_IS_LEAD, see utf_old.h.
 */
#define UTF_IS_LEAD(uchar) U16_IS_LEAD(uchar)

/**
 * Is this code unit one of several but not the first one (a trail surrogate)?
 * Same as UTF16_IS_TRAIL.
 * @deprecated ICU 2.4. Renamed to U_IS_TRAIL and U16_IS_TRAIL, see utf_old.h.
 */
#define UTF_IS_TRAIL(uchar) U16_IS_TRAIL(uchar)

/**
 * Does this code point require multiple code units (is it a supplementary code point)?
 * Same as UTF16_NEED_MULTIPLE_UCHAR.
 * @deprecated ICU 2.4. Use U16_LENGTH or test ((uint32_t)(c)>0xffff) instead.
 */
#define UTF_NEED_MULTIPLE_UCHAR(c) UTF16_NEED_MULTIPLE_UCHAR(c)

/**
 * How many code units are used to encode this code point (1 or 2)?
 * Same as UTF16_CHAR_LENGTH.
 * @deprecated ICU 2.4. Renamed to U16_LENGTH, see utf_old.h.
 */
#define UTF_CHAR_LENGTH(c) U16_LENGTH(c)

/**
 * How many code units are used at most for any Unicode code point (2)?
 * Same as UTF16_MAX_CHAR_LENGTH.
 * @deprecated ICU 2.4. Renamed to U16_MAX_LENGTH, see utf_old.h.
 */
#define UTF_MAX_CHAR_LENGTH U16_MAX_LENGTH

/**
 * Set c to the code point that contains the code unit i.
 * i could point to the lead or the trail surrogate for the code point.
 * i is not modified.
 * Same as UTF16_GET_CHAR.
 *
 * NOTE(review): this comment looks truncated. No macro definition follows
 * it before the UTYPES_H section starts, and the change-log entry below
 * appears to belong to a different file header. The text in between was
 * presumably lost when this file was extracted; compare against the
 * original header before relying on this section.
 *
 * \pre 0<=i
 * 12/07/99 helena Moved copyright notice string from ucnv_bld.h here.
******************************************************************************* */ #ifndef UTYPES_H #define UTYPES_H #include /*! * \file * \brief Basic definitions for ICU, for both C and C++ APIs * * This file defines basic types, constants, and enumerations directly or * indirectly by including other header files, especially utf.h for the * basic character and string definitions and umachine.h for consistent * integer and other types. */ /** * \def U_SHOW_CPLUSPLUS_API * @internal */ #ifdef __cplusplus # ifndef U_SHOW_CPLUSPLUS_API # endif #else # undef U_SHOW_CPLUSPLUS_API #endif /** @{ API visibility control */ /** * \def U_HIDE_DRAFT_API * Define this to 1 to request that draft API be "hidden" * @internal */ /** * \def U_HIDE_INTERNAL_API * Define this to 1 to request that internal API be "hidden" * @internal */ #if !U_DEFAULT_SHOW_DRAFT && !defined(U_SHOW_DRAFT_API) #endif #if !U_DEFAULT_SHOW_DRAFT && !defined(U_SHOW_INTERNAL_API) #endif /** @} */ /** * \def NULL * Define NULL if necessary, to nullptr for C++ and to ((void *)0) for C. * @stable ICU 2.0 */ #ifndef NULL #ifdef __cplusplus #define NULL nullptr #else #define NULL ((void *)0) #endif #endif /*===========================================================================*/ /* Calendar/TimeZone data types */ /*===========================================================================*/ /** * Date and Time data type. * This is a primitive data type that holds the date and time * as the number of milliseconds since 1970-jan-01, 00:00 UTC. * UTC leap seconds are ignored. 
* @stable ICU 2.0 */ typedef double UDate; /** The number of milliseconds per second @stable ICU 2.0 */ #define U_MILLIS_PER_SECOND (1000) /** The number of milliseconds per minute @stable ICU 2.0 */ #define U_MILLIS_PER_MINUTE (60000) /** The number of milliseconds per hour @stable ICU 2.0 */ #define U_MILLIS_PER_HOUR (3600000) /** The number of milliseconds per day @stable ICU 2.0 */ #define U_MILLIS_PER_DAY (86400000) /** * Maximum UDate value * @stable ICU 4.8 */ #define U_DATE_MAX DBL_MAX /** * Minimum UDate value * @stable ICU 4.8 */ #define U_DATE_MIN -U_DATE_MAX /*===========================================================================*/ /* Shared library/DLL import-export API control */ /*===========================================================================*/ /* * Control of symbol import/export. * ICU is separated into three libraries. */ /** * \def U_COMBINED_IMPLEMENTATION * Set to export library symbols from inside the ICU library * when all of ICU is in a single library. * This can be set as a compiler option while building ICU, and it * needs to be the first one tested to override U_COMMON_API, U_I18N_API, etc. * @stable ICU 2.0 */ /** * \def U_DATA_API * Set to export library symbols from inside the stubdata library, * and to import them from outside. * @stable ICU 3.0 */ /** * \def U_COMMON_API * Set to export library symbols from inside the common library, * and to import them from outside. * @stable ICU 2.0 */ /** * \def U_I18N_API * Set to export library symbols from inside the i18n library, * and to import them from outside. * @stable ICU 2.0 */ /** * \def U_LAYOUT_API * Set to export library symbols from inside the layout engine library, * and to import them from outside. * @stable ICU 2.0 */ /** * \def U_LAYOUTEX_API * Set to export library symbols from inside the layout extensions library, * and to import them from outside. 
* @stable ICU 2.6
 */

/**
 * \def U_IO_API
 * Set to export library symbols from inside the ustdio library,
 * and to import them from outside.
 * @stable ICU 2.0
 */

/**
 * \def U_TOOLUTIL_API
 * Set to export library symbols from inside the toolutil library,
 * and to import them from outside.
 * @stable ICU 3.4
 */

#ifdef U_IN_DOXYGEN
// This definition is required when generating the API docs.
#define U_COMBINED_IMPLEMENTATION 1
#endif

/*
 * Exactly one branch of the chain below is taken, tested in this order:
 * combined build (everything exported), static build (all macros empty),
 * then one branch per library in which only that library's macro is
 * U_EXPORT and the rest are U_IMPORT. With none of the *_IMPLEMENTATION
 * macros defined, everything is U_IMPORT.
 */
#if defined(U_COMBINED_IMPLEMENTATION)
#define U_DATA_API     U_EXPORT
#define U_COMMON_API   U_EXPORT
#define U_I18N_API     U_EXPORT
#define U_LAYOUT_API   U_EXPORT
#define U_LAYOUTEX_API U_EXPORT
#define U_IO_API       U_EXPORT
#define U_TOOLUTIL_API U_EXPORT
#elif defined(U_STATIC_IMPLEMENTATION)
#define U_DATA_API
#define U_COMMON_API
#define U_I18N_API
#define U_LAYOUT_API
#define U_LAYOUTEX_API
#define U_IO_API
#define U_TOOLUTIL_API
#elif defined(U_COMMON_IMPLEMENTATION)
#define U_DATA_API     U_IMPORT
#define U_COMMON_API   U_EXPORT
#define U_I18N_API     U_IMPORT
#define U_LAYOUT_API   U_IMPORT
#define U_LAYOUTEX_API U_IMPORT
#define U_IO_API       U_IMPORT
#define U_TOOLUTIL_API U_IMPORT
#elif defined(U_I18N_IMPLEMENTATION)
#define U_DATA_API     U_IMPORT
#define U_COMMON_API   U_IMPORT
#define U_I18N_API     U_EXPORT
#define U_LAYOUT_API   U_IMPORT
#define U_LAYOUTEX_API U_IMPORT
#define U_IO_API       U_IMPORT
#define U_TOOLUTIL_API U_IMPORT
#elif defined(U_LAYOUT_IMPLEMENTATION)
#define U_DATA_API     U_IMPORT
#define U_COMMON_API   U_IMPORT
#define U_I18N_API     U_IMPORT
#define U_LAYOUT_API   U_EXPORT
#define U_LAYOUTEX_API U_IMPORT
#define U_IO_API       U_IMPORT
#define U_TOOLUTIL_API U_IMPORT
#elif defined(U_LAYOUTEX_IMPLEMENTATION)
#define U_DATA_API     U_IMPORT
#define U_COMMON_API   U_IMPORT
#define U_I18N_API     U_IMPORT
#define U_LAYOUT_API   U_IMPORT
#define U_LAYOUTEX_API U_EXPORT
#define U_IO_API       U_IMPORT
#define U_TOOLUTIL_API U_IMPORT
#elif defined(U_IO_IMPLEMENTATION)
#define U_DATA_API     U_IMPORT
#define U_COMMON_API   U_IMPORT
#define U_I18N_API     U_IMPORT
#define U_LAYOUT_API   U_IMPORT
#define U_LAYOUTEX_API U_IMPORT
#define U_IO_API       U_EXPORT
#define U_TOOLUTIL_API U_IMPORT
#elif defined(U_TOOLUTIL_IMPLEMENTATION)
#define U_DATA_API     U_IMPORT
#define U_COMMON_API   U_IMPORT
#define U_I18N_API     U_IMPORT
#define U_LAYOUT_API   U_IMPORT
#define U_LAYOUTEX_API U_IMPORT
#define U_IO_API       U_IMPORT
#define U_TOOLUTIL_API U_EXPORT
#else
#define U_DATA_API     U_IMPORT
#define U_COMMON_API   U_IMPORT
#define U_I18N_API     U_IMPORT
#define U_LAYOUT_API   U_IMPORT
#define U_LAYOUTEX_API U_IMPORT
#define U_IO_API       U_IMPORT
#define U_TOOLUTIL_API U_IMPORT
#endif

/**
 * \def U_STANDARD_CPP_NAMESPACE
 * Control of C++ Namespace
 * Expands to the global scope operator (::) in C++ and to nothing in C.
 * @stable ICU 2.0
 */
#ifdef __cplusplus
#define U_STANDARD_CPP_NAMESPACE        ::
#else
#define U_STANDARD_CPP_NAMESPACE
#endif

/*===========================================================================*/
/* UErrorCode */
/*===========================================================================*/

/**
 * Standard ICU4C error code type, a substitute for exceptions.
 *
 * Initialize the UErrorCode with U_ZERO_ERROR, and check for success or
 * failure using U_SUCCESS() or U_FAILURE():
 *
 *     UErrorCode errorCode = U_ZERO_ERROR;
 *     // call ICU API that needs an error code parameter.
 *     if (U_FAILURE(errorCode)) {
 *         // An error occurred. Handle it here.
 *     }
 *
 * C++ code should use icu::ErrorCode, available in unicode/errorcode.h, or a
 * suitable subclass.
 *
 * For more information, see:
 * https://unicode-org.github.io/icu/userguide/dev/codingguidelines#details-about-icu-error-codes
 *
 * Note: By convention, ICU functions that take a reference (C++) or a pointer
 * (C) to a UErrorCode first test:
 *
 *     if (U_FAILURE(errorCode)) { return immediately; }
 *
 * so that in a chain of such functions the first one that sets an error code
 * causes the following ones to not perform any operations.
 *
 * @stable ICU 2.0
 */
typedef enum UErrorCode {
    /* The ordering of U_ERROR_INFO_START Vs U_USING_FALLBACK_WARNING looks weird
     * and is that way because VC++ debugger displays first encountered constant,
     * which is not what the code is used for
     */

    /* Warning/informational codes. All of them are negative, so U_SUCCESS() is true for them. */
    U_USING_FALLBACK_WARNING = -128, /**< A resource bundle lookup returned a fallback result (not an error) */

    U_ERROR_WARNING_START = -128, /**< Start of information results (semantically successful) */

    U_USING_DEFAULT_WARNING = -127, /**< A resource bundle lookup returned a result from the root locale (not an error) */

    U_SAFECLONE_ALLOCATED_WARNING = -126, /**< A SafeClone operation required allocating memory (informational only) */

    U_STATE_OLD_WARNING = -125, /**< ICU has to use compatibility layer to construct the service. Expect performance/memory usage degradation. Consider upgrading */

    U_STRING_NOT_TERMINATED_WARNING = -124,/**< An output string could not be NUL-terminated because output length==destCapacity. */

    U_SORT_KEY_TOO_SHORT_WARNING = -123, /**< Number of levels requested in getBound is higher than the number of levels in the sort key */

    U_AMBIGUOUS_ALIAS_WARNING = -122, /**< This converter alias can go to different converter implementations */

    U_DIFFERENT_UCA_VERSION = -121, /**< ucol_open encountered a mismatch between UCA version and collator image version, so the collator was constructed from rules. No impact to further function */

    U_PLUGIN_CHANGED_LEVEL_WARNING = -120, /**< A plugin caused a level change. May not be an error, but later plugins may not load. */

    U_ZERO_ERROR = 0, /**< No error, no warning. */

    /* Failure codes. All of them are positive, so U_FAILURE() is true for them. */
    U_ILLEGAL_ARGUMENT_ERROR = 1, /**< Start of codes indicating failure */
    U_MISSING_RESOURCE_ERROR = 2, /**< The requested resource cannot be found */
    U_INVALID_FORMAT_ERROR = 3, /**< Data format is not what is expected */
    U_FILE_ACCESS_ERROR = 4, /**< The requested file cannot be found */
    U_INTERNAL_PROGRAM_ERROR = 5, /**< Indicates a bug in the library code */
    U_MESSAGE_PARSE_ERROR = 6, /**< Unable to parse a message (message format) */
    U_MEMORY_ALLOCATION_ERROR = 7, /**< Memory allocation error */
    U_INDEX_OUTOFBOUNDS_ERROR = 8, /**< Trying to access the index that is out of bounds */
    U_PARSE_ERROR = 9, /**< Equivalent to Java ParseException */
    U_INVALID_CHAR_FOUND = 10, /**< Character conversion: Unmappable input sequence. In other APIs: Invalid character. */
    U_TRUNCATED_CHAR_FOUND = 11, /**< Character conversion: Incomplete input sequence. */
    U_ILLEGAL_CHAR_FOUND = 12, /**< Character conversion: Illegal input sequence/combination of input units. */
    U_INVALID_TABLE_FORMAT = 13, /**< Conversion table file found, but corrupted */
    U_INVALID_TABLE_FILE = 14, /**< Conversion table file not found */
    U_BUFFER_OVERFLOW_ERROR = 15, /**< A result would not fit in the supplied buffer */
    U_UNSUPPORTED_ERROR = 16, /**< Requested operation not supported in current context */
    U_RESOURCE_TYPE_MISMATCH = 17, /**< an operation is requested over a resource that does not support it */
    U_ILLEGAL_ESCAPE_SEQUENCE = 18, /**< ISO-2022 illegal escape sequence */
    U_UNSUPPORTED_ESCAPE_SEQUENCE = 19, /**< ISO-2022 unsupported escape sequence */
    U_NO_SPACE_AVAILABLE = 20, /**< No space available for in-buffer expansion for Arabic shaping */
    U_CE_NOT_FOUND_ERROR = 21, /**< Currently used only while setting variable top, but can be used generally */
    U_PRIMARY_TOO_LONG_ERROR = 22, /**< User tried to set variable top to a primary that is longer than two bytes */
    U_STATE_TOO_OLD_ERROR = 23, /**< ICU cannot construct a service from this state, as it is no longer supported */
    U_TOO_MANY_ALIASES_ERROR = 24, /**< There are too many aliases in the path to the requested resource.
                                        It is very possible that a circular alias definition has occurred */
    U_ENUM_OUT_OF_SYNC_ERROR = 25, /**< UEnumeration out of sync with underlying collection */
    U_INVARIANT_CONVERSION_ERROR = 26, /**< Unable to convert a UChar* string to char* with the invariant converter. */
    U_INVALID_STATE_ERROR = 27, /**< Requested operation can not be completed with ICU in its current state */
    U_COLLATOR_VERSION_MISMATCH = 28, /**< Collator version is not compatible with the base version */
    U_USELESS_COLLATOR_ERROR = 29, /**< Collator is options only and no base is specified */
    U_NO_WRITE_PERMISSION = 30, /**< Attempt to modify read-only or constant data. */
#if (NTDDI_VERSION >= NTDDI_WIN11_ZN)
    /**
     * The input is impractically long for an operation.
     * It is rejected because it may lead to problems such as excessive
     * processing time, stack depth, or heap memory requirements.
     *
     * @stable ICU 68
     */
    U_INPUT_TOO_LONG_ERROR = 31,
#endif // (NTDDI_VERSION >= NTDDI_WIN11_ZN)

    /*
     * Error codes in the range 0x10000-0x100ff are reserved for Transliterator.
     */
    U_BAD_VARIABLE_DEFINITION=0x10000,/**< Missing '$' or duplicate variable name */
    U_PARSE_ERROR_START = 0x10000, /**< Start of Transliterator errors */
    U_MALFORMED_RULE, /**< Elements of a rule are misplaced */
    U_MALFORMED_SET, /**< A UnicodeSet pattern is invalid*/
    U_MALFORMED_SYMBOL_REFERENCE, /**< UNUSED as of ICU 2.4 */
    U_MALFORMED_UNICODE_ESCAPE, /**< A Unicode escape pattern is invalid*/
    U_MALFORMED_VARIABLE_DEFINITION, /**< A variable definition is invalid */
    U_MALFORMED_VARIABLE_REFERENCE, /**< A variable reference is invalid */
    U_MISMATCHED_SEGMENT_DELIMITERS, /**< UNUSED as of ICU 2.4 */
    U_MISPLACED_ANCHOR_START, /**< A start anchor appears at an illegal position */
    U_MISPLACED_CURSOR_OFFSET, /**< A cursor offset occurs at an illegal position */
    U_MISPLACED_QUANTIFIER, /**< A quantifier appears after a segment close delimiter */
    U_MISSING_OPERATOR, /**< A rule contains no operator */
    U_MISSING_SEGMENT_CLOSE, /**< UNUSED as of ICU 2.4 */
    U_MULTIPLE_ANTE_CONTEXTS, /**< More than one ante context */
    U_MULTIPLE_CURSORS, /**< More than one cursor */
    U_MULTIPLE_POST_CONTEXTS, /**< More than one post context */
    U_TRAILING_BACKSLASH, /**< A dangling backslash */
    U_UNDEFINED_SEGMENT_REFERENCE, /**< A segment reference does not correspond to a defined segment */
    U_UNDEFINED_VARIABLE, /**< A variable reference does not correspond to a defined variable */
    U_UNQUOTED_SPECIAL, /**< A special character was not quoted or escaped */
    U_UNTERMINATED_QUOTE, /**< A closing single quote is missing */
    U_RULE_MASK_ERROR, /**< A rule is hidden by an earlier more general rule */
    U_MISPLACED_COMPOUND_FILTER, /**< A compound filter is in an invalid location */
    U_MULTIPLE_COMPOUND_FILTERS, /**< More than one compound filter */
    U_INVALID_RBT_SYNTAX, /**< A "::id" rule was passed to the RuleBasedTransliterator parser */
    U_INVALID_PROPERTY_PATTERN, /**< UNUSED as of ICU 2.4 */
    U_MALFORMED_PRAGMA, /**< A 'use' pragma is invalid */
    U_UNCLOSED_SEGMENT, /**< A closing ')' is missing */
    U_ILLEGAL_CHAR_IN_SEGMENT, /**< UNUSED as of ICU 2.4 */
    U_VARIABLE_RANGE_EXHAUSTED, /**< Too many stand-ins generated for the given variable range */
    U_VARIABLE_RANGE_OVERLAP, /**< The variable range overlaps characters used in rules */
    U_ILLEGAL_CHARACTER, /**< A special character is outside its allowed context */
    U_INTERNAL_TRANSLITERATOR_ERROR, /**< Internal transliterator system error */
    U_INVALID_ID, /**< A "::id" rule specifies an unknown transliterator */
    U_INVALID_FUNCTION, /**< A "&fn()" rule specifies an unknown transliterator */

    /*
     * Error codes in the range 0x10100-0x101ff are reserved for the formatting API.
     */
    U_UNEXPECTED_TOKEN=0x10100, /**< Syntax error in format pattern */
    U_FMT_PARSE_ERROR_START=0x10100, /**< Start of format library errors */
    U_MULTIPLE_DECIMAL_SEPARATORS, /**< More than one decimal separator in number pattern */
    U_MULTIPLE_DECIMAL_SEPERATORS = U_MULTIPLE_DECIMAL_SEPARATORS, /**< Typo: kept for backward compatibility. Use U_MULTIPLE_DECIMAL_SEPARATORS */
    U_MULTIPLE_EXPONENTIAL_SYMBOLS, /**< More than one exponent symbol in number pattern */
    U_MALFORMED_EXPONENTIAL_PATTERN, /**< Grouping symbol in exponent pattern */
    U_MULTIPLE_PERCENT_SYMBOLS, /**< More than one percent symbol in number pattern */
    U_MULTIPLE_PERMILL_SYMBOLS, /**< More than one permill symbol in number pattern */
    U_MULTIPLE_PAD_SPECIFIERS, /**< More than one pad symbol in number pattern */
    U_PATTERN_SYNTAX_ERROR, /**< Syntax error in format pattern */
    U_ILLEGAL_PAD_POSITION, /**< Pad symbol misplaced in number pattern */
    U_UNMATCHED_BRACES, /**< Braces do not match in message pattern */
    U_UNSUPPORTED_PROPERTY, /**< UNUSED as of ICU 2.4 */
    U_UNSUPPORTED_ATTRIBUTE, /**< UNUSED as of ICU 2.4 */
    U_ARGUMENT_TYPE_MISMATCH, /**< Argument name and argument index mismatch in MessageFormat functions */
    U_DUPLICATE_KEYWORD, /**< Duplicate keyword in PluralFormat */
    U_UNDEFINED_KEYWORD, /**< Undefined Plural keyword */
    U_DEFAULT_KEYWORD_MISSING, /**< Missing DEFAULT rule in plural rules */
    U_DECIMAL_NUMBER_SYNTAX_ERROR, /**< Decimal number syntax error */
    U_FORMAT_INEXACT_ERROR, /**< Cannot format a number exactly and rounding mode is ROUND_UNNECESSARY @stable ICU 4.8 */
#if (NTDDI_VERSION >= NTDDI_WIN10_VB)
    U_NUMBER_ARG_OUTOFBOUNDS_ERROR, /**< The argument to a NumberFormatter helper method was out of bounds; the bounds are usually 0 to 999. @stable ICU 61 */
    U_NUMBER_SKELETON_SYNTAX_ERROR, /**< The number skeleton passed to C++ NumberFormatter or C UNumberFormatter was invalid or contained a syntax error. @stable ICU 62 */
#endif // (NTDDI_VERSION >= NTDDI_WIN10_VB)

    /*
     * Error codes in the range 0x10200-0x102ff are reserved for BreakIterator.
     */
    U_BRK_INTERNAL_ERROR=0x10200, /**< An internal error (bug) was detected. */
    U_BRK_ERROR_START=0x10200, /**< Start of codes indicating Break Iterator failures */
    U_BRK_HEX_DIGITS_EXPECTED, /**< Hex digits expected as part of an escaped char in a rule. */
    U_BRK_SEMICOLON_EXPECTED, /**< Missing ';' at the end of a RBBI rule. */
    U_BRK_RULE_SYNTAX, /**< Syntax error in RBBI rule. */
    U_BRK_UNCLOSED_SET, /**< UnicodeSet writing an RBBI rule missing a closing ']'. */
    U_BRK_ASSIGN_ERROR, /**< Syntax error in RBBI rule assignment statement. */
    U_BRK_VARIABLE_REDFINITION, /**< RBBI rule $Variable redefined. */
    U_BRK_MISMATCHED_PAREN, /**< Mis-matched parentheses in an RBBI rule. */
    U_BRK_NEW_LINE_IN_QUOTED_STRING, /**< Missing closing quote in an RBBI rule. */
    U_BRK_UNDEFINED_VARIABLE, /**< Use of an undefined $Variable in an RBBI rule. */
    U_BRK_INIT_ERROR, /**< Initialization failure. Probable missing ICU Data. */
    U_BRK_RULE_EMPTY_SET, /**< Rule contains an empty Unicode Set. */
    U_BRK_UNRECOGNIZED_OPTION, /**< !!option in RBBI rules not recognized. */
    U_BRK_MALFORMED_RULE_TAG, /**< The {nnn} tag on a rule is malformed */

    /*
     * Error codes in the range 0x10300-0x103ff are reserved for regular expression related errors.
     */
    U_REGEX_INTERNAL_ERROR=0x10300, /**< An internal error (bug) was detected. */
    U_REGEX_ERROR_START=0x10300, /**< Start of codes indicating Regexp failures */
    U_REGEX_RULE_SYNTAX, /**< Syntax error in regexp pattern. */
    U_REGEX_INVALID_STATE, /**< RegexMatcher in invalid state for requested operation */
    U_REGEX_BAD_ESCAPE_SEQUENCE, /**< Unrecognized backslash escape sequence in pattern */
    U_REGEX_PROPERTY_SYNTAX, /**< Incorrect Unicode property */
    U_REGEX_UNIMPLEMENTED, /**< Use of regexp feature that is not yet implemented. */
    U_REGEX_MISMATCHED_PAREN, /**< Incorrectly nested parentheses in regexp pattern. */
    U_REGEX_NUMBER_TOO_BIG, /**< Decimal number is too large. */
    U_REGEX_BAD_INTERVAL, /**< Error in {min,max} interval */
    U_REGEX_MAX_LT_MIN, /**< In {min,max}, max is less than min. */
    U_REGEX_INVALID_BACK_REF, /**< Back-reference to a non-existent capture group. */
    U_REGEX_INVALID_FLAG, /**< Invalid value for match mode flags. */
    U_REGEX_LOOK_BEHIND_LIMIT, /**< Look-Behind pattern matches must have a bounded maximum length. */
    U_REGEX_SET_CONTAINS_STRING, /**< Regexps cannot have UnicodeSets containing strings.*/
    /* The explicit "+2" leaves the value U_REGEX_SET_CONTAINS_STRING+1 unassigned in this
     * header; presumably it belongs to a constant that was removed -- confirm against upstream ICU. */
    U_REGEX_MISSING_CLOSE_BRACKET=U_REGEX_SET_CONTAINS_STRING+2, /**< Missing closing bracket on a bracket expression. */
    U_REGEX_INVALID_RANGE, /**< In a character range [x-y], x is greater than y. */
    U_REGEX_STACK_OVERFLOW, /**< Regular expression backtrack stack overflow. */
    U_REGEX_TIME_OUT, /**< Maximum allowed match time exceeded */
    U_REGEX_STOPPED_BY_CALLER, /**< Matching operation aborted by user callback fn. */
    U_REGEX_PATTERN_TOO_BIG, /**< Pattern exceeds limits on size or complexity. @stable ICU 55 */
    U_REGEX_INVALID_CAPTURE_GROUP_NAME, /**< Invalid capture group name. @stable ICU 55 */

    /*
     * Error codes in the range 0x10400-0x104ff are reserved for IDNA related error codes.
     */
    U_IDNA_PROHIBITED_ERROR=0x10400,
    U_IDNA_ERROR_START=0x10400,
    U_IDNA_UNASSIGNED_ERROR,
    U_IDNA_CHECK_BIDI_ERROR,
    U_IDNA_STD3_ASCII_RULES_ERROR,
    U_IDNA_ACE_PREFIX_ERROR,
    U_IDNA_VERIFICATION_ERROR,
    U_IDNA_LABEL_TOO_LONG_ERROR,
    U_IDNA_ZERO_LENGTH_LABEL_ERROR,
    U_IDNA_DOMAIN_NAME_TOO_LONG_ERROR,
    /*
     * Aliases for StringPrep
     */
    U_STRINGPREP_PROHIBITED_ERROR = U_IDNA_PROHIBITED_ERROR,
    U_STRINGPREP_UNASSIGNED_ERROR = U_IDNA_UNASSIGNED_ERROR,
    U_STRINGPREP_CHECK_BIDI_ERROR = U_IDNA_CHECK_BIDI_ERROR,

    /*
     * Error codes in the range 0x10500-0x105ff are reserved for Plugin related error codes.
     */
    U_PLUGIN_ERROR_START=0x10500, /**< Start of codes indicating plugin failures */
    U_PLUGIN_TOO_HIGH=0x10500, /**< The plugin's level is too high to be loaded right now. */
    U_PLUGIN_DIDNT_SET_LEVEL, /**< The plugin didn't call uplug_setPlugLevel in response to a QUERY */
} UErrorCode;

/* Use the following to determine if an UErrorCode represents */
/* operational success or failure. */
/* Both forms compare against U_ZERO_ERROR: warnings (negative) and U_ZERO_ERROR */
/* count as success, every positive code counts as failure. */

#ifdef __cplusplus
    /**
     * Does the error code indicate success?
     * @stable ICU 2.0
     */
    static
    inline UBool U_SUCCESS(UErrorCode code) { return (UBool)(code<=U_ZERO_ERROR); }
    /**
     * Does the error code indicate a failure?
     * @stable ICU 2.0
     */
    static
    inline UBool U_FAILURE(UErrorCode code) { return (UBool)(code>U_ZERO_ERROR); }
#else
    /**
     * Does the error code indicate success?
     * @stable ICU 2.0
     */
# define U_SUCCESS(x) ((x)<=U_ZERO_ERROR)
    /**
     * Does the error code indicate a failure?
     * @stable ICU 2.0
     */
# define U_FAILURE(x) ((x)>U_ZERO_ERROR)
#endif

/**
 * Return a string for a UErrorCode value.
 * The string will be the same as the name of the error code constant
 * in the UErrorCode enum above.
 * @stable ICU 2.0
 */
U_CAPI const char * U_EXPORT2
u_errorName(UErrorCode code);

#endif /* _UTYPES */

// utrace.h
// Copyright (C) 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * * Copyright (C) 2003-2013, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* * file name: utrace.h * encoding: UTF-8 * tab size: 8 (not used) * indentation:4 * * created on: 2003aug06 * created by: Markus W. Scherer * * Definitions for ICU tracing/logging. * */ #ifndef __UTRACE_H__ #define __UTRACE_H__ #include /** * \file * \brief C API: Definitions for ICU tracing/logging. * * This provides API for debugging the internals of ICU without the use of * a traditional debugger. * * By default, tracing is disabled in ICU. If you need to debug ICU with * tracing, please compile ICU with the --enable-tracing configure option. */ U_CDECL_BEGIN /** * Trace severity levels. Higher levels increase the verbosity of the trace output. * @see utrace_setLevel * @stable ICU 2.8 */ typedef enum UTraceLevel { /** Disable all tracing @stable ICU 2.8*/ UTRACE_OFF=-1, /** Trace error conditions only @stable ICU 2.8*/ UTRACE_ERROR=0, /** Trace errors and warnings @stable ICU 2.8*/ UTRACE_WARNING=3, /** Trace opens and closes of ICU services @stable ICU 2.8*/ UTRACE_OPEN_CLOSE=5, /** Trace an intermediate number of ICU operations @stable ICU 2.8*/ UTRACE_INFO=7, /** Trace the maximum number of ICU operations @stable ICU 2.8*/ UTRACE_VERBOSE=9 } UTraceLevel; /** * These are the ICU functions that will be traced when tracing is enabled. 
 * @stable ICU 2.8
 */
typedef enum UTraceFunctionNumber {
    /* General functions start at 0. */
    UTRACE_FUNCTION_START=0,
    UTRACE_U_INIT=UTRACE_FUNCTION_START,
    UTRACE_U_CLEANUP,

    /* Converter functions start at 0x1000. */
    UTRACE_CONVERSION_START=0x1000,
    UTRACE_UCNV_OPEN=UTRACE_CONVERSION_START,
    UTRACE_UCNV_OPEN_PACKAGE,
    UTRACE_UCNV_OPEN_ALGORITHMIC,
    UTRACE_UCNV_CLONE,
    UTRACE_UCNV_CLOSE,
    UTRACE_UCNV_FLUSH_CACHE,
    UTRACE_UCNV_LOAD,
    UTRACE_UCNV_UNLOAD,

    /* Collation functions start at 0x2000. */
    UTRACE_COLLATION_START=0x2000,
    UTRACE_UCOL_OPEN=UTRACE_COLLATION_START,
    UTRACE_UCOL_CLOSE,
    UTRACE_UCOL_STRCOLL,
    UTRACE_UCOL_GET_SORTKEY,
    UTRACE_UCOL_GETLOCALE,
    UTRACE_UCOL_NEXTSORTKEYPART,
    UTRACE_UCOL_STRCOLLITER,
    UTRACE_UCOL_OPEN_FROM_SHORT_STRING,
    UTRACE_UCOL_STRCOLLUTF8, /**< @stable ICU 50 */

#if (NTDDI_VERSION >= NTDDI_WIN10_CO)
    /**
     * The lowest resource/data location.
     * @stable ICU 65
     */
    UTRACE_UDATA_START=0x3000,

    /**
     * Indicates that a value was read from a resource bundle. Provides three
     * C-style strings to UTraceData: type, file name, and resource path. The
     * possible types are:
     *
     * - "string" (a string value was accessed)
     * - "binary" (a binary value was accessed)
     * - "intvector" (an integer vector value was accessed)
     * - "int" (a signed integer value was accessed)
     * - "uint" (an unsigned integer value was accessed)
     * - "get" (a path was loaded, but the value was not accessed)
     * - "getalias" (a path was loaded, and an alias was resolved)
     *
     * @stable ICU 65
     */
    UTRACE_UDATA_RESOURCE=UTRACE_UDATA_START,

    /**
     * Indicates that a resource bundle was opened.
     *
     * Provides one C-style string to UTraceData: file name.
     * @stable ICU 65
     */
    UTRACE_UDATA_BUNDLE,

    /**
     * Indicates that a data file was opened, but not *.res files.
     *
     * Provides one C-style string to UTraceData: file name.
     *
     * @stable ICU 65
     */
    UTRACE_UDATA_DATA_FILE,

    /**
     * Indicates that a *.res file was opened.
     *
     * This differs from UTRACE_UDATA_BUNDLE because a res file is typically
     * opened only once per application runtime, but the bundle corresponding
     * to that res file may be opened many times.
     *
     * Provides one C-style string to UTraceData: file name.
     *
     * @stable ICU 65
     */
    UTRACE_UDATA_RES_FILE,
#endif // (NTDDI_VERSION >= NTDDI_WIN10_CO)

} UTraceFunctionNumber;

/**
 * Setter for the trace level.
 * @param traceLevel A UTraceLevel value.
 * @stable ICU 2.8
 */
U_CAPI void U_EXPORT2
utrace_setLevel(int32_t traceLevel);

/**
 * Getter for the trace level.
 * @return The UTraceLevel value being used by ICU.
 * @stable ICU 2.8
 */
U_CAPI int32_t U_EXPORT2
utrace_getLevel(void);

/* Trace function pointers types ----------------------------- */

/**
 * Type signature for the trace function to be called when entering a function.
 * @param context value supplied at the time the trace functions are set.
 * @param fnNumber Enum value indicating the ICU function being entered.
 * @stable ICU 2.8
 */
typedef void U_CALLCONV
UTraceEntry(const void *context, int32_t fnNumber);

/**
 * Type signature for the trace function to be called when exiting from a function.
 * @param context value supplied at the time the trace functions are set.
 * @param fnNumber Enum value indicating the ICU function being exited.
 * @param fmt A formatting string that describes the number and types
 *            of arguments included with the variable args. The fmt
 *            string has the same form as the utrace_vformat format
 *            string.
 * @param args A variable arguments list. Contents are described by
 *            the fmt parameter.
 * @see utrace_vformat
 * @stable ICU 2.8
 */
typedef void U_CALLCONV
UTraceExit(const void *context, int32_t fnNumber,
           const char *fmt, va_list args);

/**
 * Type signature for the trace function to be called from within an ICU function
 * to display data or messages.
 * @param context value supplied at the time the trace functions are set.
 * @param fnNumber Enum value indicating the ICU function being exited.
 * @param level The current tracing level
 * @param fmt A format string describing the tracing data that is supplied
 *            as variable args
 * @param args The data being traced, passed as variable args.
 * @stable ICU 2.8
 */
typedef void U_CALLCONV
UTraceData(const void *context, int32_t fnNumber, int32_t level,
           const char *fmt, va_list args);

/**
 * Set ICU Tracing functions. Installs application-provided tracing
 * functions into ICU. After doing this, subsequent ICU operations
 * will call back to the installed functions, providing a trace
 * of the use of ICU. Passing a NULL pointer for a tracing function
 * is allowed, and inhibits tracing action at points where that function
 * would be called.
 * <p>
 * Tracing and Threads: Tracing functions are global to a process, and
 * will be called in response to ICU operations performed by any
 * thread. If tracing of an individual thread is desired, the
 * tracing functions must themselves filter by checking that the
 * current thread is the desired thread.
 *
 * @param context an uninterpreted pointer. Whatever is passed in
 *                here will in turn be passed to each of the tracing
 *                functions UTraceEntry, UTraceExit and UTraceData.
 *                ICU does not use or alter this pointer.
 * @param e       Callback function to be called on entry to a
 *                traced ICU function.
 * @param x       Callback function to be called on exit from a
 *                traced ICU function.
 * @param d       Callback function to be called from within a
 *                traced ICU function, for the purpose of providing
 *                data to the trace.
 *
 * @stable ICU 2.8
 */
U_CAPI void U_EXPORT2
utrace_setFunctions(const void *context,
                    UTraceEntry *e, UTraceExit *x, UTraceData *d);

/**
 * Get the currently installed ICU tracing functions. Note that a null function
 * pointer will be returned if no trace function has been set.
 *
 * @param context The currently installed tracing context.
 * @param e       The currently installed UTraceEntry function.
 * @param x       The currently installed UTraceExit function.
 * @param d       The currently installed UTraceData function.
 * @stable ICU 2.8
 */
U_CAPI void U_EXPORT2
utrace_getFunctions(const void **context,
                    UTraceEntry **e, UTraceExit **x, UTraceData **d);

/*
 *
 * ICU trace format string syntax
 *
 * Format Strings are passed to UTraceData functions, and define the
 * number and types of the trace data being passed on each call.
 *
 * The UTraceData function, which is supplied by the application,
 * not by ICU, can either forward the trace data (passed via
 * varargs) and the format string back to ICU for formatting into
 * a displayable string, or it can interpret the format itself,
 * and do as it wishes with the trace data.
 *
 *
 * Goals for the format string
 * - basic data output
 * - easy to use for trace programmer
 * - sufficient provision for data types for trace output readability
 * - well-defined types and binary portable APIs
 *
 * Non-goals
 * - printf compatibility
 * - fancy formatting
 * - argument reordering and other internationalization features
 *
 * ICU trace format strings contain plain text with argument inserts,
 * much like standard printf format strings.
 * Each insert begins with a '%', then optionally contains a 'v',
 * then exactly one type character.
 * Two '%' in a row represent a '%' instead of an insert.
 * The trace format strings need not have \n at the end.
 *
 *
 * Types
 * -----
 *
 * Type characters:
 * - c A char character in the default codepage.
 * - s A NUL-terminated char * string in the default codepage.
 * - S A UChar * string. Requires two params, (ptr, length). Length=-1 for nul term.
 * - b A byte (8-bit integer).
 * - h A 16-bit integer. Also a 16 bit Unicode code unit.
 * - d A 32-bit integer. Also a 21 bit Unicode code point value.
 * - l A 64-bit integer.
 * - p A data pointer.
 *
 * Vectors
 * -------
 *
 * If the 'v' is not specified, then one item of the specified type
 * is passed in.
 * If the 'v' (for "vector") is specified, then a vector of items of the
 * specified type is passed in, via a pointer to the first item
 * and an int32_t value for the length of the vector.
 * Length==-1 means zero or NUL termination. Works for vectors of all types.
 *
 * Note: %vS is a vector of (UChar *) strings. The strings must
 *       be nul terminated as there is no way to provide a
 *       separate length parameter for each string. The length
 *       parameter (required for all vectors) is the number of
 *       strings, not the length of the strings.
 *
 * Examples
 * --------
 *
 * These examples show the parameters that will be passed to an application's
 * UTraceData() function for various formats.
 *
 * - the precise formatting is up to the application!
 * - the examples use type casts for arguments only to _show_ the types of
 *   arguments without needing variable declarations in the examples;
 *   the type casts will not be necessary in actual code
 *
 * UTraceDataFunc(context, fnNumber, level,
 *              "There is a character %c in the string %s.", // Format String
 *              (char)c, (const char *)s); // varargs parameters
 * -> There is a character 0x42 'B' in the string "Bravo".
 *
 * UTraceDataFunc(context, fnNumber, level,
 *              "Vector of bytes %vb vector of chars %vc",
 *              (const uint8_t *)bytes, (int32_t)bytesLength,
 *              (const char *)chars, (int32_t)charsLength);
 * -> Vector of bytes
 *      42 63 64 3f [4]
 *    vector of chars
 *      "Bcd?"[4]
 *
 * UTraceDataFunc(context, fnNumber, level,
 *              "An int32_t %d and a whole bunch of them %vd",
 *              (int32_t)-5, (const int32_t *)ints, (int32_t)intsLength);
 * -> An int32_t 0xfffffffb and a whole bunch of them
 *      fffffffb 00000005 0000010a [3]
 *
 */

/**
 * Trace output Formatter. An application's UTraceData tracing functions may call
 * back to this function to format the trace output in a
 * human readable form. Note that a UTraceData function may choose
 * to not format the data; it could, for example, save it
 * in the raw form it was received (more compact), leaving
 * formatting for a later trace analysis tool.
 * @param outBuf pointer to a buffer to receive the formatted output. Output
 *               will be nul terminated if there is space in the buffer -
 *               if the length of the requested output < the output buffer size.
 * @param capacity Length of the output buffer.
 * @param indent Number of spaces to indent the output. Intended to allow
 *               data displayed from nested functions to be indented for readability.
 * @param fmt Format specification for the data to output
 * @param args Data to be formatted.
 * @return Length of formatted output, including the terminating NUL.
 *         If buffer capacity is insufficient, the required capacity is returned.
 * @stable ICU 2.8
 */
U_CAPI int32_t U_EXPORT2
utrace_vformat(char *outBuf, int32_t capacity,
               int32_t indent, const char *fmt, va_list args);

/**
 * Trace output Formatter. An application's UTraceData tracing functions may call
 * this function to format any additional trace data, beyond that
 * provided by default, in human readable form with the same
 * formatting conventions used by utrace_vformat().
 * @param outBuf pointer to a buffer to receive the formatted output. Output
 *               will be nul terminated if there is space in the buffer -
 *               if the length of the requested output < the output buffer size.
 * @param capacity Length of the output buffer.
 * @param indent Number of spaces to indent the output. Intended to allow
 *               data displayed from nested functions to be indented for readability.
 * @param fmt Format specification for the data to output
 * @param ... Data to be formatted.
 * @return Length of formatted output, including the terminating NUL.
 *         If buffer capacity is insufficient, the required capacity is returned.
 * @stable ICU 2.8
 */
U_CAPI int32_t U_EXPORT2
utrace_format(char *outBuf, int32_t capacity,
              int32_t indent, const char *fmt, ...);

/* Trace function numbers --------------------------------------------------- */

/**
 * Get the name of a function from its trace function number.
 *
 * @param fnNumber The trace number for an ICU function.
 * @return The name string for the function.
 *
 * @see UTraceFunctionNumber
 * @stable ICU 2.8
 */
U_CAPI const char * U_EXPORT2
utrace_functionName(int32_t fnNumber);

U_CDECL_END

#endif /* __UTRACE_H__ */

// ustringtrie.h
// Copyright (C) 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*   Copyright (C) 2010-2012, International Business Machines
*   Corporation and others.  All Rights Reserved.
 *******************************************************************************
 *   file name:  ustringtrie.h
 *   encoding:   UTF-8
 *   tab size:   8 (not used)
 *   indentation:4
 *
 *   created on: 2010dec17
 *   created by: Markus W. Scherer
 */

#ifndef __USTRINGTRIE_H__
#define __USTRINGTRIE_H__

/**
 * \file
 * \brief C API: Helper definitions for dictionary trie APIs.
 */

/**
 * Return values for BytesTrie::next(), UCharsTrie::next() and similar methods.
 *
 * The enumerators carry no explicit values, so they are numbered 0..3 in
 * declaration order. The USTRINGTRIE_HAS_VALUE and USTRINGTRIE_HAS_NEXT macros
 * below depend on exactly this ordering.
 * @see USTRINGTRIE_MATCHES
 * @see USTRINGTRIE_HAS_VALUE
 * @see USTRINGTRIE_HAS_NEXT
 * @stable ICU 4.8
 */
enum UStringTrieResult {
    /**
     * The input unit(s) did not continue a matching string.
     * Once current()/next() return USTRINGTRIE_NO_MATCH,
     * all further calls to current()/next() will also return USTRINGTRIE_NO_MATCH,
     * until the trie is reset to its original state or to a saved state.
     * @stable ICU 4.8
     */
    USTRINGTRIE_NO_MATCH,
    /**
     * The input unit(s) continued a matching string
     * but there is no value for the string so far.
     * (It is a prefix of a longer string.)
     * @stable ICU 4.8
     */
    USTRINGTRIE_NO_VALUE,
    /**
     * The input unit(s) continued a matching string
     * and there is a value for the string so far.
     * This value will be returned by getValue().
     * No further input byte/unit can continue a matching string.
     * @stable ICU 4.8
     */
    USTRINGTRIE_FINAL_VALUE,
    /**
     * The input unit(s) continued a matching string
     * and there is a value for the string so far.
     * This value will be returned by getValue().
     * Another input byte/unit can continue a matching string.
     * @stable ICU 4.8
     */
    USTRINGTRIE_INTERMEDIATE_VALUE
};

/**
 * Same as (result!=USTRINGTRIE_NO_MATCH).
 * @param result A result from BytesTrie::first(), UCharsTrie::next() etc.
 * @return true if the input bytes/units so far are part of a matching string/byte sequence.
 * @stable ICU 4.8
 */
#define USTRINGTRIE_MATCHES(result) ((result)!=USTRINGTRIE_NO_MATCH)

/**
 * Equivalent to (result==USTRINGTRIE_INTERMEDIATE_VALUE || result==USTRINGTRIE_FINAL_VALUE) but
 * this macro evaluates result exactly once.
 * Works because the two value-carrying results are the last two enumerators.
 * @param result A result from BytesTrie::first(), UCharsTrie::next() etc.
 * @return true if there is a value for the input bytes/units so far.
 * @see BytesTrie::getValue
 * @see UCharsTrie::getValue
 * @stable ICU 4.8
 */
#define USTRINGTRIE_HAS_VALUE(result) ((result)>=USTRINGTRIE_FINAL_VALUE)

/**
 * Equivalent to (result==USTRINGTRIE_NO_VALUE || result==USTRINGTRIE_INTERMEDIATE_VALUE) but
 * this macro evaluates result exactly once.
 * Works because exactly those two enumerators have odd values (1 and 3).
 * @param result A result from BytesTrie::first(), UCharsTrie::next() etc.
 * @return true if another input byte/unit can continue a matching string.
 * @stable ICU 4.8
 */
#define USTRINGTRIE_HAS_NEXT(result) ((result)&1)

#endif /* __USTRINGTRIE_H__ */

// ushape.h
// Copyright (C) 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*
*   Copyright (C) 2000-2012, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
******************************************************************************
*   file name:  ushape.h
*   encoding:   UTF-8
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2000jun29
*   created by: Markus W. Scherer
*/

#ifndef __USHAPE_H__
#define __USHAPE_H__

/**
 * \file
 * \brief C API: Arabic shaping
 *
 */

/**
 * Shape Arabic text on a character basis.
 *

This function performs basic operations for "shaping" Arabic text. It is most * useful for use with legacy data formats and legacy display technology * (simple terminals). All operations are performed on Unicode characters.

* *

Text-based shaping means that some character code points in the text are * replaced by others depending on the context. It transforms one kind of text * into another. In comparison, modern displays for Arabic text select * appropriate, context-dependent font glyphs for each text element, which means * that they transform text into a glyph vector.

* *

Text transformations are necessary when modern display technology is not * available or when text needs to be transformed to or from legacy formats that * use "shaped" characters. Since the Arabic script is cursive, connecting * adjacent letters to each other, computers select images for each letter based * on the surrounding letters. This usually results in four images per Arabic * letter: initial, middle, final, and isolated forms. In Unicode, on the other * hand, letters are normally stored abstract, and a display system is expected * to select the necessary glyphs. (This makes searching and other text * processing easier because the same letter has only one code.) It is possible * to mimic this with text transformations because there are characters in * Unicode that are rendered as letters with a specific shape * (or cursive connectivity). They were included for interoperability with * legacy systems and codepages, and for unsophisticated display systems.

* *

A second kind of text transformations is supported for Arabic digits: * For compatibility with legacy codepages that only include European digits, * it is possible to replace one set of digits by another, changing the * character code points. These operations can be performed for either * Arabic-Indic Digits (U+0660...U+0669) or Eastern (Extended) Arabic-Indic * digits (U+06f0...U+06f9).

* *

Some replacements may result in more or fewer characters (code points). * By default, this means that the destination buffer may receive text with a * length different from the source length. Some legacy systems rely on the * length of the text to be constant. They expect extra spaces to be added * or consumed either next to the affected character or at the end of the * text.

* *

For details about the available operations, see the description of the * U_SHAPE_... options.

* * @param source The input text. * * @param sourceLength The number of UChars in source. * * @param dest The destination buffer that will receive the results of the * requested operations. It may be NULL only if * destSize is 0. The source and destination must not * overlap. * * @param destSize The size (capacity) of the destination buffer in UChars. * If destSize is 0, then no output is produced, * but the necessary buffer size is returned ("preflighting"). * * @param options This is a 32-bit set of flags that specify the operations * that are performed on the input text. If no error occurs, * then the result will always be written to the destination * buffer. * * @param pErrorCode must be a valid pointer to an error code value, * which must not indicate a failure before the function call. * * @return The number of UChars written to the destination buffer. * If an error occurred, then no output was written, or it may be * incomplete. If U_BUFFER_OVERFLOW_ERROR is set, then * the return value indicates the necessary destination buffer size. * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 u_shapeArabic(const UChar *source, int32_t sourceLength, UChar *dest, int32_t destSize, uint32_t options, UErrorCode *pErrorCode); /** * Memory option: allow the result to have a different length than the source. * Affects: LamAlef options * U_SHAPE_LAMALEF_RESIZE is an alias of this option. * @stable ICU 2.0 */ #define U_SHAPE_LENGTH_GROW_SHRINK 0 /** * Memory option: allow the result to have a different length than the source. * Affects: LamAlef options * This option is an alias to U_SHAPE_LENGTH_GROW_SHRINK * @stable ICU 4.2 */ #define U_SHAPE_LAMALEF_RESIZE 0 /** * Memory option: the result must have the same length as the source. * If more room is necessary, then try to consume spaces next to modified characters. * U_SHAPE_LAMALEF_NEAR is an alias of this option. * @stable ICU 2.0 */ #define U_SHAPE_LENGTH_FIXED_SPACES_NEAR 1 /** * Memory option: the result must have the same length as the source.
* If more room is necessary, then try to consume spaces next to modified characters. * Affects: LamAlef options * This option is an alias to U_SHAPE_LENGTH_FIXED_SPACES_NEAR * @stable ICU 4.2 */ #define U_SHAPE_LAMALEF_NEAR 1 /** * Memory option: the result must have the same length as the source. * If more room is necessary, then try to consume spaces at the end of the text. * U_SHAPE_LAMALEF_END is an alias of this option. * @stable ICU 2.0 */ #define U_SHAPE_LENGTH_FIXED_SPACES_AT_END 2 /** * Memory option: the result must have the same length as the source. * If more room is necessary, then try to consume spaces at the end of the text. * Affects: LamAlef options * This option is an alias to U_SHAPE_LENGTH_FIXED_SPACES_AT_END * @stable ICU 4.2 */ #define U_SHAPE_LAMALEF_END 2 /** * Memory option: the result must have the same length as the source. * If more room is necessary, then try to consume spaces at the beginning of the text. * U_SHAPE_LAMALEF_BEGIN is an alias of this option. * @stable ICU 2.0 */ #define U_SHAPE_LENGTH_FIXED_SPACES_AT_BEGINNING 3 /** * Memory option: the result must have the same length as the source. * If more room is necessary, then try to consume spaces at the beginning of the text. * Affects: LamAlef options * This option is an alias to U_SHAPE_LENGTH_FIXED_SPACES_AT_BEGINNING * @stable ICU 4.2 */ #define U_SHAPE_LAMALEF_BEGIN 3 /** * Memory option: the result must have the same length as the source. * Shaping Mode: For each LAMALEF character found, expand LAMALEF using space at the end. * If there is no space at the end, use spaces at the beginning of the buffer. If there * is no space at the beginning of the buffer, use the space near the LAMALEF character * (i.e. the space after it). * If there are no spaces found, an error U_NO_SPACE_AVAILABLE (as defined in utypes.h) * will be set in pErrorCode. * * Deshaping Mode: Behaves the same as the U_SHAPE_LAMALEF_END flag. * Affects: LamAlef options * @stable ICU 4.2 */ #define U_SHAPE_LAMALEF_AUTO 0x10000 /** Bit mask for memory options.
@stable ICU 2.0 */ #define U_SHAPE_LENGTH_MASK 0x10003 /* Changed from the old value 3; now also covers the 0x10000 bit used by U_SHAPE_LAMALEF_AUTO */ /** * Bit mask for LamAlef memory options. * @stable ICU 4.2 */ #define U_SHAPE_LAMALEF_MASK 0x10003 /* Updated; same value as U_SHAPE_LENGTH_MASK */ /** Direction indicator: the source is in logical (keyboard) order. @stable ICU 2.0 */ #define U_SHAPE_TEXT_DIRECTION_LOGICAL 0 /** * Direction indicator: * the source is in visual RTL order, * the rightmost displayed character stored first. * This option is an alias to U_SHAPE_TEXT_DIRECTION_LOGICAL * @stable ICU 4.2 */ #define U_SHAPE_TEXT_DIRECTION_VISUAL_RTL 0 /** * Direction indicator: * the source is in visual LTR order, * the leftmost displayed character stored first. * @stable ICU 2.0 */ #define U_SHAPE_TEXT_DIRECTION_VISUAL_LTR 4 /** Bit mask for direction indicators. @stable ICU 2.0 */ #define U_SHAPE_TEXT_DIRECTION_MASK 4 /** Letter shaping option: do not perform letter shaping. @stable ICU 2.0 */ #define U_SHAPE_LETTERS_NOOP 0 /** Letter shaping option: replace abstract letter characters by "shaped" ones. @stable ICU 2.0 */ #define U_SHAPE_LETTERS_SHAPE 8 /** Letter shaping option: replace "shaped" letter characters by abstract ones. @stable ICU 2.0 */ #define U_SHAPE_LETTERS_UNSHAPE 0x10 /** * Letter shaping option: replace abstract letter characters by "shaped" ones. * The only difference from U_SHAPE_LETTERS_SHAPE is that Tashkeel letters * are always "shaped" into the isolated form instead of the medial form * (selecting code points from the Arabic Presentation Forms-B block). * @stable ICU 2.0 */ #define U_SHAPE_LETTERS_SHAPE_TASHKEEL_ISOLATED 0x18 /** Bit mask for letter shaping options. @stable ICU 2.0 */ #define U_SHAPE_LETTERS_MASK 0x18 /** Digit shaping option: do not perform digit shaping. @stable ICU 2.0 */ #define U_SHAPE_DIGITS_NOOP 0 /** * Digit shaping option: * Replace European digits (U+0030...) by Arabic-Indic digits.
* @stable ICU 2.0 */ #define U_SHAPE_DIGITS_EN2AN 0x20 /** * Digit shaping option: * Replace Arabic-Indic digits by European digits (U+0030...). * @stable ICU 2.0 */ #define U_SHAPE_DIGITS_AN2EN 0x40 /** * Digit shaping option: * Replace European digits (U+0030...) by Arabic-Indic digits if the most recent * strongly directional character is an Arabic letter * (u_charDirection() result U_RIGHT_TO_LEFT_ARABIC [AL]).
* The direction of "preceding" depends on the direction indicator option. * For the first characters, the preceding strongly directional character * (initial state) is assumed to be not an Arabic letter * (it is U_LEFT_TO_RIGHT [L] or U_RIGHT_TO_LEFT [R]). * @stable ICU 2.0 */ #define U_SHAPE_DIGITS_ALEN2AN_INIT_LR 0x60 /** * Digit shaping option: * Replace European digits (U+0030...) by Arabic-Indic digits if the most recent * strongly directional character is an Arabic letter * (u_charDirection() result U_RIGHT_TO_LEFT_ARABIC [AL]).
* The direction of "preceding" depends on the direction indicator option. * For the first characters, the preceding strongly directional character * (initial state) is assumed to be an Arabic letter. * @stable ICU 2.0 */ #define U_SHAPE_DIGITS_ALEN2AN_INIT_AL 0x80 /** Not a valid option value. May be replaced by a new option. @stable ICU 2.0 */ #define U_SHAPE_DIGITS_RESERVED 0xa0 /** Bit mask for digit shaping options. @stable ICU 2.0 */ #define U_SHAPE_DIGITS_MASK 0xe0 /** Digit type option: Use Arabic-Indic digits (U+0660...U+0669). @stable ICU 2.0 */ #define U_SHAPE_DIGIT_TYPE_AN 0 /** Digit type option: Use Eastern (Extended) Arabic-Indic digits (U+06f0...U+06f9). @stable ICU 2.0 */ #define U_SHAPE_DIGIT_TYPE_AN_EXTENDED 0x100 /** Not a valid option value. May be replaced by a new option. @stable ICU 2.0 */ #define U_SHAPE_DIGIT_TYPE_RESERVED 0x200 /** Bit mask for digit type options. @stable ICU 2.0 */ #define U_SHAPE_DIGIT_TYPE_MASK 0x300 /* The original note asked to change this from 0x3f00 to 0x300; the value is now 0x300 */ /** * Tashkeel aggregation option: * Replaces any combination of U+0651 with one of * U+064C, U+064D, U+064E, U+064F, U+0650 with * U+FC5E, U+FC5F, U+FC60, U+FC61, U+FC62 consecutively. * @stable ICU 3.6 */ #define U_SHAPE_AGGREGATE_TASHKEEL 0x4000 /** Tashkeel aggregation option: do not aggregate tashkeels. @stable ICU 3.6 */ #define U_SHAPE_AGGREGATE_TASHKEEL_NOOP 0 /** Bit mask for tashkeel aggregation. @stable ICU 3.6 */ #define U_SHAPE_AGGREGATE_TASHKEEL_MASK 0x4000 /** * Presentation form option: * Don't replace Arabic Presentation Forms-A and Arabic Presentation Forms-B * characters with U+06xx characters, before shaping. * @stable ICU 3.6 */ #define U_SHAPE_PRESERVE_PRESENTATION 0x8000 /** Presentation form option: * Replace Arabic Presentation Forms-A and Arabic Presentation Forms-B with * their unshaped correspondents in range U+06xx, before shaping. * @stable ICU 3.6 */ #define U_SHAPE_PRESERVE_PRESENTATION_NOOP 0 /** Bit mask for preserve presentation form.
@stable ICU 3.6 */ #define U_SHAPE_PRESERVE_PRESENTATION_MASK 0x8000 /* Seen Tail option */ /** * Memory option: the result must have the same length as the source. * Shaping mode: The SEEN family character will expand into two characters using space near * the SEEN family character(i.e. the space after the character). * If there are no spaces found, an error U_NO_SPACE_AVAILABLE (as defined in utypes.h) * will be set in pErrorCode * * De-shaping mode: Any Seen character followed by Tail character will be * replaced by one cell Seen and a space will replace the Tail. * Affects: Seen options * @stable ICU 4.2 */ #define U_SHAPE_SEEN_TWOCELL_NEAR 0x200000 /** * Bit mask for Seen memory options. * @stable ICU 4.2 */ #define U_SHAPE_SEEN_MASK 0x700000 /* YehHamza option */ /** * Memory option: the result must have the same length as the source. * Shaping mode: The YEHHAMZA character will expand into two characters using space near it * (i.e. the space after the character * If there are no spaces found, an error U_NO_SPACE_AVAILABLE (as defined in utypes.h) * will be set in pErrorCode * * De-shaping mode: Any Yeh (final or isolated) character followed by Hamza character will be * replaced by one cell YehHamza and space will replace the Hamza. * Affects: YehHamza options * @stable ICU 4.2 */ #define U_SHAPE_YEHHAMZA_TWOCELL_NEAR 0x1000000 /** * Bit mask for YehHamza memory options. * @stable ICU 4.2 */ #define U_SHAPE_YEHHAMZA_MASK 0x3800000 /* New Tashkeel options */ /** * Memory option: the result must have the same length as the source. * Shaping mode: Tashkeel characters will be replaced by spaces. * Spaces will be placed at beginning of the buffer * * De-shaping mode: N/A * Affects: Tashkeel options * @stable ICU 4.2 */ #define U_SHAPE_TASHKEEL_BEGIN 0x40000 /** * Memory option: the result must have the same length as the source. * Shaping mode: Tashkeel characters will be replaced by spaces. 
* Spaces will be placed at end of the buffer * * De-shaping mode: N/A * Affects: Tashkeel options * @stable ICU 4.2 */ #define U_SHAPE_TASHKEEL_END 0x60000 /** * Memory option: allow the result to have a different length than the source. * Shaping mode: Tashkeel characters will be removed, buffer length will shrink. * De-shaping mode: N/A * * Affect: Tashkeel options * @stable ICU 4.2 */ #define U_SHAPE_TASHKEEL_RESIZE 0x80000 /** * Memory option: the result must have the same length as the source. * Shaping mode: Tashkeel characters will be replaced by Tatweel if it is connected to adjacent * characters (i.e. shaped on Tatweel) or replaced by space if it is not connected. * * De-shaping mode: N/A * Affects: YehHamza options * @stable ICU 4.2 */ #define U_SHAPE_TASHKEEL_REPLACE_BY_TATWEEL 0xC0000 /** * Bit mask for Tashkeel replacement with Space or Tatweel memory options. * @stable ICU 4.2 */ #define U_SHAPE_TASHKEEL_MASK 0xE0000 /* Space location Control options */ /** * This option affect the meaning of BEGIN and END options. if this option is not used the default * for BEGIN and END will be as following: * The Default (for both Visual LTR, Visual RTL and Logical Text) * 1. BEGIN always refers to the start address of physical memory. * 2. END always refers to the end address of physical memory. * * If this option is used it will swap the meaning of BEGIN and END only for Visual LTR text. * * The effect on BEGIN and END Memory Options will be as following: * A. BEGIN For Visual LTR text: This will be the beginning (right side) of the visual text( * corresponding to the physical memory address end for Visual LTR text, Same as END in * default behavior) * B. BEGIN For Logical text: Same as BEGIN in default behavior. * C. END For Visual LTR text: This will be the end (left side) of the visual text (corresponding * to the physical memory address beginning for Visual LTR text, Same as BEGIN in default behavior. * D. 
END For Logical text: Same as END in default behavior). * Affects: All LamAlef BEGIN, END and AUTO options. * @stable ICU 4.2 */ #define U_SHAPE_SPACES_RELATIVE_TO_TEXT_BEGIN_END 0x4000000 /** * Bit mask for swapping BEGIN and END for Visual LTR text * @stable ICU 4.2 */ #define U_SHAPE_SPACES_RELATIVE_TO_TEXT_MASK 0x4000000 /** * If this option is used, shaping will use the new Unicode code point for TAIL (i.e. 0xFE73). * If this option is not specified (Default), old unofficial Unicode TAIL code point is used (i.e. 0x200B) * De-shaping will not use this option as it will always search for both the new Unicode code point for the * TAIL (i.e. 0xFE73) or the old unofficial Unicode TAIL code point (i.e. 0x200B) and de-shape the * Seen-Family letter accordingly. * * Shaping Mode: Only shaping. * De-shaping Mode: N/A. * Affects: All Seen options * @stable ICU 4.8 */ #define U_SHAPE_TAIL_NEW_UNICODE 0x8000000 /** * Bit mask for new Unicode Tail option * @stable ICU 4.8 */ #define U_SHAPE_TAIL_TYPE_MASK 0x8000000 #endif // uscript.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (C) 1997-2016, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * * File USCRIPT.H * * Modification History: * * Date Name Description * 07/06/2001 Ram Creation. ****************************************************************************** */ #ifndef USCRIPT_H #define USCRIPT_H /** * \file * \brief C API: Unicode Script Information */ /** * Constants for ISO 15924 script codes. * * The current set of script code constants supports at least all scripts * that are encoded in the version of Unicode which ICU currently supports. * The names of the constants are usually derived from the * Unicode script property value aliases. 
* See UAX #24 Unicode Script Property (http://www.unicode.org/reports/tr24/) * and http://www.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt . * * In addition, constants for many ISO 15924 script codes * are included, for use with language tags, CLDR data, and similar. * Some of those codes are not used in the Unicode Character Database (UCD). * For example, there are no characters that have a UCD script property value of * Hans or Hant. All Han ideographs have the Hani script property value in Unicode. * * Private-use codes Qaaa..Qabx are not included, except as used in the UCD or in CLDR. * * Starting with ICU 55, script codes are only added when their scripts * have been or will certainly be encoded in Unicode, * and have been assigned Unicode script property value aliases, * to ensure that their script names are stable and match the names of the constants. * Script codes like Latf and Aran that are not subject to separate encoding * may be added at any time. * * @stable ICU 2.2 */ typedef enum UScriptCode { /* * Note: UScriptCode constants and their ISO script code comments * are parsed by preparseucd.py. 
* It matches lines like * USCRIPT_<Unicode Script value name> = <integer>, / * <ISO script code> * / */ /** @stable ICU 2.2 */ USCRIPT_INVALID_CODE = -1, /** @stable ICU 2.2 */ USCRIPT_COMMON = 0, /* Zyyy */ /** @stable ICU 2.2 */ USCRIPT_INHERITED = 1, /* Zinh */ /* "Code for inherited script", for non-spacing combining marks; also Qaai */ /** @stable ICU 2.2 */ USCRIPT_ARABIC = 2, /* Arab */ /** @stable ICU 2.2 */ USCRIPT_ARMENIAN = 3, /* Armn */ /** @stable ICU 2.2 */ USCRIPT_BENGALI = 4, /* Beng */ /** @stable ICU 2.2 */ USCRIPT_BOPOMOFO = 5, /* Bopo */ /** @stable ICU 2.2 */ USCRIPT_CHEROKEE = 6, /* Cher */ /** @stable ICU 2.2 */ USCRIPT_COPTIC = 7, /* Copt */ /** @stable ICU 2.2 */ USCRIPT_CYRILLIC = 8, /* Cyrl */ /** @stable ICU 2.2 */ USCRIPT_DESERET = 9, /* Dsrt */ /** @stable ICU 2.2 */ USCRIPT_DEVANAGARI = 10, /* Deva */ /** @stable ICU 2.2 */ USCRIPT_ETHIOPIC = 11, /* Ethi */ /** @stable ICU 2.2 */ USCRIPT_GEORGIAN = 12, /* Geor */ /** @stable ICU 2.2 */ USCRIPT_GOTHIC = 13, /* Goth */ /** @stable ICU 2.2 */ USCRIPT_GREEK = 14, /* Grek */ /** @stable ICU 2.2 */ USCRIPT_GUJARATI = 15, /* Gujr */ /** @stable ICU 2.2 */ USCRIPT_GURMUKHI = 16, /* Guru */ /** @stable ICU 2.2 */ USCRIPT_HAN = 17, /* Hani */ /** @stable ICU 2.2 */ USCRIPT_HANGUL = 18, /* Hang */ /** @stable ICU 2.2 */ USCRIPT_HEBREW = 19, /* Hebr */ /** @stable ICU 2.2 */ USCRIPT_HIRAGANA = 20, /* Hira */ /** @stable ICU 2.2 */ USCRIPT_KANNADA = 21, /* Knda */ /** @stable ICU 2.2 */ USCRIPT_KATAKANA = 22, /* Kana */ /** @stable ICU 2.2 */ USCRIPT_KHMER = 23, /* Khmr */ /** @stable ICU 2.2 */ USCRIPT_LAO = 24, /* Laoo */ /** @stable ICU 2.2 */ USCRIPT_LATIN = 25, /* Latn */ /** @stable ICU 2.2 */ USCRIPT_MALAYALAM = 26, /* Mlym */ /** @stable ICU 2.2 */ USCRIPT_MONGOLIAN = 27, /* Mong */ /** @stable ICU 2.2 */ USCRIPT_MYANMAR = 28, /* Mymr */ /** @stable ICU 2.2 */ USCRIPT_OGHAM = 29, /* Ogam */ /** @stable ICU 2.2 */ USCRIPT_OLD_ITALIC = 30, /* Ital */ /** @stable ICU 2.2 */ USCRIPT_ORIYA = 31, /* Orya */ /** @stable ICU 2.2 */ USCRIPT_RUNIC = 
32, /* Runr */ /** @stable ICU 2.2 */ USCRIPT_SINHALA = 33, /* Sinh */ /** @stable ICU 2.2 */ USCRIPT_SYRIAC = 34, /* Syrc */ /** @stable ICU 2.2 */ USCRIPT_TAMIL = 35, /* Taml */ /** @stable ICU 2.2 */ USCRIPT_TELUGU = 36, /* Telu */ /** @stable ICU 2.2 */ USCRIPT_THAANA = 37, /* Thaa */ /** @stable ICU 2.2 */ USCRIPT_THAI = 38, /* Thai */ /** @stable ICU 2.2 */ USCRIPT_TIBETAN = 39, /* Tibt */ /** Canadian_Aboriginal script. @stable ICU 2.6 */ USCRIPT_CANADIAN_ABORIGINAL = 40, /* Cans */ /** Canadian_Aboriginal script (alias). @stable ICU 2.2 */ USCRIPT_UCAS = USCRIPT_CANADIAN_ABORIGINAL, /** @stable ICU 2.2 */ USCRIPT_YI = 41, /* Yiii */ /* New scripts in Unicode 3.2 */ /** @stable ICU 2.2 */ USCRIPT_TAGALOG = 42, /* Tglg */ /** @stable ICU 2.2 */ USCRIPT_HANUNOO = 43, /* Hano */ /** @stable ICU 2.2 */ USCRIPT_BUHID = 44, /* Buhd */ /** @stable ICU 2.2 */ USCRIPT_TAGBANWA = 45, /* Tagb */ /* New scripts in Unicode 4 */ /** @stable ICU 2.6 */ USCRIPT_BRAILLE = 46, /* Brai */ /** @stable ICU 2.6 */ USCRIPT_CYPRIOT = 47, /* Cprt */ /** @stable ICU 2.6 */ USCRIPT_LIMBU = 48, /* Limb */ /** @stable ICU 2.6 */ USCRIPT_LINEAR_B = 49, /* Linb */ /** @stable ICU 2.6 */ USCRIPT_OSMANYA = 50, /* Osma */ /** @stable ICU 2.6 */ USCRIPT_SHAVIAN = 51, /* Shaw */ /** @stable ICU 2.6 */ USCRIPT_TAI_LE = 52, /* Tale */ /** @stable ICU 2.6 */ USCRIPT_UGARITIC = 53, /* Ugar */ /** New script code in Unicode 4.0.1 @stable ICU 3.0 */ USCRIPT_KATAKANA_OR_HIRAGANA = 54,/*Hrkt */ /* New scripts in Unicode 4.1 */ /** @stable ICU 3.4 */ USCRIPT_BUGINESE = 55, /* Bugi */ /** @stable ICU 3.4 */ USCRIPT_GLAGOLITIC = 56, /* Glag */ /** @stable ICU 3.4 */ USCRIPT_KHAROSHTHI = 57, /* Khar */ /** @stable ICU 3.4 */ USCRIPT_SYLOTI_NAGRI = 58, /* Sylo */ /** @stable ICU 3.4 */ USCRIPT_NEW_TAI_LUE = 59, /* Talu */ /** @stable ICU 3.4 */ USCRIPT_TIFINAGH = 60, /* Tfng */ /** @stable ICU 3.4 */ USCRIPT_OLD_PERSIAN = 61, /* Xpeo */ /* New script codes from Unicode and ISO 15924 */ /** @stable ICU 3.6 
*/ USCRIPT_BALINESE = 62, /* Bali */ /** @stable ICU 3.6 */ USCRIPT_BATAK = 63, /* Batk */ /** @stable ICU 3.6 */ USCRIPT_BLISSYMBOLS = 64, /* Blis */ /** @stable ICU 3.6 */ USCRIPT_BRAHMI = 65, /* Brah */ /** @stable ICU 3.6 */ USCRIPT_CHAM = 66, /* Cham */ /** @stable ICU 3.6 */ USCRIPT_CIRTH = 67, /* Cirt */ /** @stable ICU 3.6 */ USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC = 68, /* Cyrs */ /** @stable ICU 3.6 */ USCRIPT_DEMOTIC_EGYPTIAN = 69, /* Egyd */ /** @stable ICU 3.6 */ USCRIPT_HIERATIC_EGYPTIAN = 70, /* Egyh */ /** @stable ICU 3.6 */ USCRIPT_EGYPTIAN_HIEROGLYPHS = 71, /* Egyp */ /** @stable ICU 3.6 */ USCRIPT_KHUTSURI = 72, /* Geok */ /** @stable ICU 3.6 */ USCRIPT_SIMPLIFIED_HAN = 73, /* Hans */ /** @stable ICU 3.6 */ USCRIPT_TRADITIONAL_HAN = 74, /* Hant */ /** @stable ICU 3.6 */ USCRIPT_PAHAWH_HMONG = 75, /* Hmng */ /** @stable ICU 3.6 */ USCRIPT_OLD_HUNGARIAN = 76, /* Hung */ /** @stable ICU 3.6 */ USCRIPT_HARAPPAN_INDUS = 77, /* Inds */ /** @stable ICU 3.6 */ USCRIPT_JAVANESE = 78, /* Java */ /** @stable ICU 3.6 */ USCRIPT_KAYAH_LI = 79, /* Kali */ /** @stable ICU 3.6 */ USCRIPT_LATIN_FRAKTUR = 80, /* Latf */ /** @stable ICU 3.6 */ USCRIPT_LATIN_GAELIC = 81, /* Latg */ /** @stable ICU 3.6 */ USCRIPT_LEPCHA = 82, /* Lepc */ /** @stable ICU 3.6 */ USCRIPT_LINEAR_A = 83, /* Lina */ /** @stable ICU 4.6 */ USCRIPT_MANDAIC = 84, /* Mand */ /** Alias for USCRIPT_MANDAIC. @stable ICU 3.6 */ USCRIPT_MANDAEAN = USCRIPT_MANDAIC, /** @stable ICU 3.6 */ USCRIPT_MAYAN_HIEROGLYPHS = 85, /* Maya */ /** @stable ICU 4.6 */ USCRIPT_MEROITIC_HIEROGLYPHS = 86, /* Mero */ /** Alias for USCRIPT_MEROITIC_HIEROGLYPHS. @stable ICU 3.6 */ USCRIPT_MEROITIC = USCRIPT_MEROITIC_HIEROGLYPHS, /** @stable ICU 3.6 */ USCRIPT_NKO = 87, /* Nkoo */ /** @stable ICU 3.6 */ USCRIPT_ORKHON = 88, /* Orkh */ /** @stable ICU 3.6 */ USCRIPT_OLD_PERMIC = 89, /* Perm */ /** @stable ICU 3.6 */ USCRIPT_PHAGS_PA = 90, /* Phag */ /** @stable ICU 3.6 */ USCRIPT_PHOENICIAN = 91, /* Phnx */ /** @stable ICU 52 */ USCRIPT_MIAO = 92, /* Plrd */ /** Alias for USCRIPT_MIAO. @stable ICU 3.6 */ 
USCRIPT_PHONETIC_POLLARD = USCRIPT_MIAO, /** @stable ICU 3.6 */ USCRIPT_RONGORONGO = 93, /* Roro */ /** @stable ICU 3.6 */ USCRIPT_SARATI = 94, /* Sara */ /** @stable ICU 3.6 */ USCRIPT_ESTRANGELO_SYRIAC = 95, /* Syre */ /** @stable ICU 3.6 */ USCRIPT_WESTERN_SYRIAC = 96, /* Syrj */ /** @stable ICU 3.6 */ USCRIPT_EASTERN_SYRIAC = 97, /* Syrn */ /** @stable ICU 3.6 */ USCRIPT_TENGWAR = 98, /* Teng */ /** @stable ICU 3.6 */ USCRIPT_VAI = 99, /* Vaii */ /** @stable ICU 3.6 */ USCRIPT_VISIBLE_SPEECH = 100,/* Visp */ /** @stable ICU 3.6 */ USCRIPT_CUNEIFORM = 101,/* Xsux */ /** @stable ICU 3.6 */ USCRIPT_UNWRITTEN_LANGUAGES = 102,/* Zxxx */ /** @stable ICU 3.6 */ USCRIPT_UNKNOWN = 103,/* Zzzz */ /* Unknown="Code for uncoded script", for unassigned code points */ /** @stable ICU 3.8 */ USCRIPT_CARIAN = 104,/* Cari */ /** @stable ICU 3.8 */ USCRIPT_JAPANESE = 105,/* Jpan */ /** @stable ICU 3.8 */ USCRIPT_LANNA = 106,/* Lana */ /** @stable ICU 3.8 */ USCRIPT_LYCIAN = 107,/* Lyci */ /** @stable ICU 3.8 */ USCRIPT_LYDIAN = 108,/* Lydi */ /** @stable ICU 3.8 */ USCRIPT_OL_CHIKI = 109,/* Olck */ /** @stable ICU 3.8 */ USCRIPT_REJANG = 110,/* Rjng */ /** @stable ICU 3.8 */ USCRIPT_SAURASHTRA = 111,/* Saur */ /** Sutton SignWriting @stable ICU 3.8 */ USCRIPT_SIGN_WRITING = 112,/* Sgnw */ /** @stable ICU 3.8 */ USCRIPT_SUNDANESE = 113,/* Sund */ /** @stable ICU 3.8 */ USCRIPT_MOON = 114,/* Moon */ /** @stable ICU 3.8 */ USCRIPT_MEITEI_MAYEK = 115,/* Mtei */ /** @stable ICU 4.0 */ USCRIPT_IMPERIAL_ARAMAIC = 116,/* Armi */ /** @stable ICU 4.0 */ USCRIPT_AVESTAN = 117,/* Avst */ /** @stable ICU 4.0 */ USCRIPT_CHAKMA = 118,/* Cakm */ /** @stable ICU 4.0 */ USCRIPT_KOREAN = 119,/* Kore */ /** @stable ICU 4.0 */ USCRIPT_KAITHI = 120,/* Kthi */ /** @stable ICU 4.0 */ USCRIPT_MANICHAEAN = 121,/* Mani */ /** @stable ICU 4.0 */ USCRIPT_INSCRIPTIONAL_PAHLAVI = 122,/* Phli */ /** @stable ICU 4.0 */ USCRIPT_PSALTER_PAHLAVI = 123,/* Phlp */ /** @stable ICU 4.0 */ USCRIPT_BOOK_PAHLAVI = 124,/* 
Phlv */ /** @stable ICU 4.0 */ USCRIPT_INSCRIPTIONAL_PARTHIAN = 125,/* Prti */ /** @stable ICU 4.0 */ USCRIPT_SAMARITAN = 126,/* Samr */ /** @stable ICU 4.0 */ USCRIPT_TAI_VIET = 127,/* Tavt */ /** @stable ICU 4.0 */ USCRIPT_MATHEMATICAL_NOTATION = 128,/* Zmth */ /** @stable ICU 4.0 */ USCRIPT_SYMBOLS = 129,/* Zsym */ /** @stable ICU 4.4 */ USCRIPT_BAMUM = 130,/* Bamu */ /** @stable ICU 4.4 */ USCRIPT_LISU = 131,/* Lisu */ /** @stable ICU 4.4 */ USCRIPT_NAKHI_GEBA = 132,/* Nkgb */ /** @stable ICU 4.4 */ USCRIPT_OLD_SOUTH_ARABIAN = 133,/* Sarb */ /** @stable ICU 4.6 */ USCRIPT_BASSA_VAH = 134,/* Bass */ /** @stable ICU 54 */ USCRIPT_DUPLOYAN = 135,/* Dupl */ /** @stable ICU 4.6 */ USCRIPT_ELBASAN = 136,/* Elba */ /** @stable ICU 4.6 */ USCRIPT_GRANTHA = 137,/* Gran */ /** @stable ICU 4.6 */ USCRIPT_KPELLE = 138,/* Kpel */ /** @stable ICU 4.6 */ USCRIPT_LOMA = 139,/* Loma */ /** Mende Kikakui @stable ICU 4.6 */ USCRIPT_MENDE = 140,/* Mend */ /** @stable ICU 4.6 */ USCRIPT_MEROITIC_CURSIVE = 141,/* Merc */ /** @stable ICU 4.6 */ USCRIPT_OLD_NORTH_ARABIAN = 142,/* Narb */ /** @stable ICU 4.6 */ USCRIPT_NABATAEAN = 143,/* Nbat */ /** @stable ICU 4.6 */ USCRIPT_PALMYRENE = 144,/* Palm */ /** @stable ICU 54 */ USCRIPT_KHUDAWADI = 145,/* Sind */ /** Alias for USCRIPT_KHUDAWADI. @stable ICU 4.6 */ USCRIPT_SINDHI = USCRIPT_KHUDAWADI, /** @stable ICU 4.6 */ USCRIPT_WARANG_CITI = 146,/* Wara */ /** @stable ICU 4.8 */ USCRIPT_AFAKA = 147,/* Afak */ /** @stable ICU 4.8 */ USCRIPT_JURCHEN = 148,/* Jurc */ /** @stable ICU 4.8 */ USCRIPT_MRO = 149,/* Mroo */ /** @stable ICU 4.8 */ USCRIPT_NUSHU = 150,/* Nshu */ /** @stable ICU 4.8 */ USCRIPT_SHARADA = 151,/* Shrd */ /** @stable ICU 4.8 */ USCRIPT_SORA_SOMPENG = 152,/* Sora */ /** @stable ICU 4.8 */ USCRIPT_TAKRI = 153,/* Takr */ /** @stable ICU 4.8 */ USCRIPT_TANGUT = 154,/* Tang */ /** @stable ICU 4.8 */ USCRIPT_WOLEAI = 155,/* Wole */ /** @stable ICU 49 */ USCRIPT_ANATOLIAN_HIEROGLYPHS = 156,/* Hluw */ /** @stable ICU 49 */ USCRIPT_KHOJKI = 157,/* Khoj */ 
/** @stable ICU 49 */ USCRIPT_TIRHUTA = 158,/* Tirh */ /** @stable ICU 52 */ USCRIPT_CAUCASIAN_ALBANIAN = 159,/* Aghb */ /** @stable ICU 52 */ USCRIPT_MAHAJANI = 160,/* Mahj */ /** @stable ICU 54 */ USCRIPT_AHOM = 161,/* Ahom */ /** @stable ICU 54 */ USCRIPT_HATRAN = 162,/* Hatr */ /** @stable ICU 54 */ USCRIPT_MODI = 163,/* Modi */ /** @stable ICU 54 */ USCRIPT_MULTANI = 164,/* Mult */ /** @stable ICU 54 */ USCRIPT_PAU_CIN_HAU = 165,/* Pauc */ /** @stable ICU 54 */ USCRIPT_SIDDHAM = 166,/* Sidd */ /** @stable ICU 58 */ USCRIPT_ADLAM = 167,/* Adlm */ /** @stable ICU 58 */ USCRIPT_BHAIKSUKI = 168,/* Bhks */ /** @stable ICU 58 */ USCRIPT_MARCHEN = 169,/* Marc */ /** @stable ICU 58 */ USCRIPT_NEWA = 170,/* Newa */ /** @stable ICU 58 */ USCRIPT_OSAGE = 171,/* Osge */ /** @stable ICU 58 */ USCRIPT_HAN_WITH_BOPOMOFO = 172,/* Hanb */ /** @stable ICU 58 */ USCRIPT_JAMO = 173,/* Jamo */ /** @stable ICU 58 */ USCRIPT_SYMBOLS_EMOJI = 174,/* Zsye */ /** @stable ICU 60 */ USCRIPT_MASARAM_GONDI = 175,/* Gonm */ /** @stable ICU 60 */ USCRIPT_SOYOMBO = 176,/* Soyo */ /** @stable ICU 60 */ USCRIPT_ZANABAZAR_SQUARE = 177,/* Zanb */ #if (NTDDI_VERSION >= NTDDI_WIN10_19H1) /** @stable ICU 62 */ USCRIPT_DOGRA = 178,/* Dogr */ /** @stable ICU 62 */ USCRIPT_GUNJALA_GONDI = 179,/* Gong */ /** @stable ICU 62 */ USCRIPT_MAKASAR = 180,/* Maka */ /** @stable ICU 62 */ USCRIPT_MEDEFAIDRIN = 181,/* Medf */ /** @stable ICU 62 */ USCRIPT_HANIFI_ROHINGYA = 182,/* Rohg */ /** @stable ICU 62 */ USCRIPT_SOGDIAN = 183,/* Sogd */ /** @stable ICU 62 */ USCRIPT_OLD_SOGDIAN = 184,/* Sogo */ #endif // (NTDDI_VERSION >= NTDDI_WIN10_19H1) #if (NTDDI_VERSION >= NTDDI_WIN10_VB) /** @stable ICU 64 */ USCRIPT_ELYMAIC = 185,/* Elym */ /** @stable ICU 64 */ USCRIPT_NYIAKENG_PUACHUE_HMONG = 186,/* Hmnp */ /** @stable ICU 64 */ USCRIPT_NANDINAGARI = 187,/* Nand */ /** @stable ICU 64 */ USCRIPT_WANCHO = 188,/* Wcho */ #endif // (NTDDI_VERSION >= NTDDI_WIN10_VB) #if (NTDDI_VERSION >= NTDDI_WIN10_CO) /** @stable ICU 66 
*/ USCRIPT_CHORASMIAN = 189,/* Chrs */ /** @stable ICU 66 */ USCRIPT_DIVES_AKURU = 190,/* Diak */ /** @stable ICU 66 */ USCRIPT_KHITAN_SMALL_SCRIPT = 191,/* Kits */ /** @stable ICU 66 */ USCRIPT_YEZIDI = 192,/* Yezi */ #endif // (NTDDI_VERSION >= NTDDI_WIN10_CO) } UScriptCode; /** * Gets the script codes associated with the given locale or ISO 15924 abbreviation or name. * Fills in USCRIPT_MALAYALAM given "Malayalam" OR "Mlym". * Fills in USCRIPT_LATIN given "en" OR "en_US" * If the required capacity is greater than the capacity of the destination buffer, * then the error code is set to U_BUFFER_OVERFLOW_ERROR and the required capacity is returned. * *

Note: To search by short or long script alias only, use * u_getPropertyValueEnum(UCHAR_SCRIPT, alias) instead. That does * a fast lookup with no access of the locale data. * * @param nameOrAbbrOrLocale name of the script, as given in * PropertyValueAliases.txt, or ISO 15924 code or locale * @param fillIn the UScriptCode buffer to fill in the script code * @param capacity the capacity (size) of UScriptCode buffer passed in. * @param err the error status code. * @return The number of script codes filled in the buffer passed in * @stable ICU 2.4 */ U_CAPI int32_t U_EXPORT2 uscript_getCode(const char* nameOrAbbrOrLocale,UScriptCode* fillIn,int32_t capacity,UErrorCode *err); /** * Returns the long Unicode script name, if there is one. * Otherwise returns the 4-letter ISO 15924 script code. * Returns "Malayalam" given USCRIPT_MALAYALAM. * * @param scriptCode UScriptCode enum * @return long script name as given in PropertyValueAliases.txt, or the 4-letter code, * or NULL if scriptCode is invalid * @stable ICU 2.4 */ U_CAPI const char* U_EXPORT2 uscript_getName(UScriptCode scriptCode); /** * Returns the 4-letter ISO 15924 script code, * which is the same as the short Unicode script name if Unicode has names for the script. * Returns "Mlym" given USCRIPT_MALAYALAM. * * @param scriptCode UScriptCode enum * @return short script name (4-letter code), or NULL if scriptCode is invalid * @stable ICU 2.4 */ U_CAPI const char* U_EXPORT2 uscript_getShortName(UScriptCode scriptCode); /** * Gets the script code associated with the given codepoint. * Returns USCRIPT_MALAYALAM given 0x0D02 * @param codepoint UChar32 codepoint * @param err the error status code. * @return The UScriptCode, or 0 if codepoint is invalid * @stable ICU 2.4 */ U_CAPI UScriptCode U_EXPORT2 uscript_getScript(UChar32 codepoint, UErrorCode *err); /** * Do the Script_Extensions of code point c contain script sc? 
* If c does not have explicit Script_Extensions, then this tests whether * c has the Script property value sc. * * Some characters are commonly used in multiple scripts. * For more information, see UAX #24: http://www.unicode.org/reports/tr24/. * @param c code point * @param sc script code * @return true if sc is in Script_Extensions(c) * @stable ICU 49 */ U_CAPI UBool U_EXPORT2 uscript_hasScript(UChar32 c, UScriptCode sc); /** * Writes code point c's Script_Extensions as a list of UScriptCode values * to the output scripts array and returns the number of script codes. * - If c does have Script_Extensions, then the Script property value * (normally Common or Inherited) is not included. * - If c does not have Script_Extensions, then the one Script code is written to the output array. * - If c is not a valid code point, then the one USCRIPT_UNKNOWN code is written. * In other words, if the return value is 1, * then the output array contains exactly c's single Script code. * If the return value is n>=2, then the output array contains c's n Script_Extensions script codes. * * Some characters are commonly used in multiple scripts. * For more information, see UAX #24: http://www.unicode.org/reports/tr24/. * * If there are more than capacity script codes to be written, then * U_BUFFER_OVERFLOW_ERROR is set and the number of Script_Extensions is returned. * (Usual ICU buffer handling behavior.) * * @param c code point * @param scripts output script code array * @param capacity capacity of the scripts array * @param errorCode Standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) 
* @return number of script codes in c's Script_Extensions, or 1 for the single Script value, * written to scripts unless U_BUFFER_OVERFLOW_ERROR indicates insufficient capacity * @stable ICU 49 */ U_CAPI int32_t U_EXPORT2 uscript_getScriptExtensions(UChar32 c, UScriptCode *scripts, int32_t capacity, UErrorCode *errorCode); /** * Script usage constants. * See UAX #31 Unicode Identifier and Pattern Syntax. * http://www.unicode.org/reports/tr31/#Table_Candidate_Characters_for_Exclusion_from_Identifiers * * @stable ICU 51 */ typedef enum UScriptUsage { /** Not encoded in Unicode. @stable ICU 51 */ USCRIPT_USAGE_NOT_ENCODED, /** Unknown script usage. @stable ICU 51 */ USCRIPT_USAGE_UNKNOWN, /** Candidate for Exclusion from Identifiers. @stable ICU 51 */ USCRIPT_USAGE_EXCLUDED, /** Limited Use script. @stable ICU 51 */ USCRIPT_USAGE_LIMITED_USE, /** Aspirational Use script. @stable ICU 51 */ USCRIPT_USAGE_ASPIRATIONAL, /** Recommended script. @stable ICU 51 */ USCRIPT_USAGE_RECOMMENDED } UScriptUsage; /** * Writes the script sample character string. * This string normally consists of one code point but might be longer. * The string is empty if the script is not encoded. * * @param script script code * @param dest output string array * @param capacity number of UChars in the dest array * @param pErrorCode standard ICU in/out error code, must pass U_SUCCESS() on input * @return the string length, even if U_BUFFER_OVERFLOW_ERROR * @stable ICU 51 */ U_CAPI int32_t U_EXPORT2 uscript_getSampleString(UScriptCode script, UChar *dest, int32_t capacity, UErrorCode *pErrorCode); /** * Returns the script usage according to UAX #31 Unicode Identifier and Pattern Syntax. * Returns USCRIPT_USAGE_NOT_ENCODED if the script is not encoded in Unicode. * * @param script script code * @return script usage * @see UScriptUsage * @stable ICU 51 */ U_CAPI UScriptUsage U_EXPORT2 uscript_getUsage(UScriptCode script); /** * Returns true if the script is written right-to-left. 
* For example, Arab and Hebr. * * @param script script code * @return true if the script is right-to-left * @stable ICU 51 */ U_CAPI UBool U_EXPORT2 uscript_isRightToLeft(UScriptCode script); /** * Returns true if the script allows line breaks between letters (excluding hyphenation). * Such a script typically requires dictionary-based line breaking. * For example, Hani and Thai. * * @param script script code * @return true if the script allows line breaks between letters * @stable ICU 51 */ U_CAPI UBool U_EXPORT2 uscript_breaksBetweenLetters(UScriptCode script); /** * Returns true if in modern (or most recent) usage of the script case distinctions are customary. * For example, Latn and Cyrl. * * @param script script code * @return true if the script is cased * @stable ICU 51 */ U_CAPI UBool U_EXPORT2 uscript_isCased(UScriptCode script); #endif // urep.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ****************************************************************************** * Copyright (C) 1997-2010, International Business Machines * Corporation and others. All Rights Reserved. ****************************************************************************** * Date Name Description * 06/23/00 aliu Creation. 
****************************************************************************** */ #ifndef __UREP_H #define __UREP_H U_CDECL_BEGIN /******************************************************************** * General Notes ******************************************************************** * TODO * Add usage scenario * Add test code * Talk about pinning * Talk about "can truncate result if out of memory" */ /******************************************************************** * Data Structures ********************************************************************/ /** * \file * \brief C API: Callbacks for UReplaceable */ /** * An opaque replaceable text object. 
This will be manipulated only * through the caller-supplied UReplaceableFunctor struct. Related * to the C++ class Replaceable. * This is currently only used in the Transliterator C API, see utrans.h . * @stable ICU 2.0 */ typedef void* UReplaceable; /** * A set of function pointers that transliterators use to manipulate a * UReplaceable. The caller should supply the required functions to * manipulate their text appropriately. Related to the C++ class * Replaceable. * @stable ICU 2.0 */ typedef struct UReplaceableCallbacks { /** * Function pointer that returns the number of UChar code units in * this text. * * @param rep A pointer to "this" UReplaceable object. * @return The length of the text. * @stable ICU 2.0 */ int32_t (*length)(const UReplaceable* rep); /** * Function pointer that returns a UChar code unit at the given * offset into this text; 0 <= offset < n, where n is the value * returned by (*length)(rep). See unistr.h for a description of * charAt() vs. char32At(). * * @param rep A pointer to "this" UReplaceable object. * @param offset The index at which to fetch the UChar (code unit). * @return The UChar (code unit) at offset, or U+FFFF if the offset is out of bounds. * @stable ICU 2.0 */ UChar (*charAt)(const UReplaceable* rep, int32_t offset); /** * Function pointer that returns a UChar32 code point at the given * offset into this text. See unistr.h for a description of * charAt() vs. char32At(). * * @param rep A pointer to "this" UReplaceable object. * @param offset The index at which to fetch the UChar32 (code point). * @return The UChar32 (code point) at offset, or U+FFFF if the offset is out of bounds. * @stable ICU 2.0 */ UChar32 (*char32At)(const UReplaceable* rep, int32_t offset); /** * Function pointer that replaces text between start and limit in * this text with the given text. Attributes (out of band info) * should be retained. * * @param rep A pointer to "this" UReplaceable object. 
* @param start the starting index of the text to be replaced, * inclusive. * @param limit the ending index of the text to be replaced, * exclusive. * @param text the new text to replace the UChars from * start..limit-1. * @param textLength the number of UChars at text, or -1 if text * is null-terminated. * @stable ICU 2.0 */ void (*replace)(UReplaceable* rep, int32_t start, int32_t limit, const UChar* text, int32_t textLength); /** * Function pointer that copies the characters in the range * [start, limit) into the array dst. * * @param rep A pointer to "this" UReplaceable object. * @param start offset of first character which will be copied * into the array * @param limit offset immediately following the last character to * be copied * @param dst array in which to copy characters. The length of * dst must be at least (limit - start). * @stable ICU 2.1 */ void (*extract)(UReplaceable* rep, int32_t start, int32_t limit, UChar* dst); /** * Function pointer that copies text between start and limit in * this text to another index in the text. Attributes (out of * band info) should be retained. After this call, there will be * (at least) two copies of the characters originally located at * start..limit-1. * * @param rep A pointer to "this" UReplaceable object. * @param start the starting index of the text to be copied, * inclusive. * @param limit the ending index of the text to be copied, * exclusive. * @param dest the index at which the copy of the UChars should be * inserted. * @stable ICU 2.0 */ void (*copy)(UReplaceable* rep, int32_t start, int32_t limit, int32_t dest); } UReplaceableCallbacks; U_CDECL_END #endif // umisc.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (C) 1999-2006, International Business Machines * Corporation and others. All Rights Reserved. 
********************************************************************** * file name: umisc.h * encoding: UTF-8 * tab size: 8 (not used) * indentation:4 * * created on: 1999oct15 * created by: Markus W. Scherer */ #ifndef UMISC_H #define UMISC_H /** * \file * \brief C API: Miscellaneous definitions * * This file contains miscellaneous definitions for the C APIs. */ U_CDECL_BEGIN /** A struct representing a range of text containing a specific field * @stable ICU 2.0 */ typedef struct UFieldPosition { /** * The field * @stable ICU 2.0 */ int32_t field; /** * The start of the text range containing field * @stable ICU 2.0 */ int32_t beginIndex; /** * The limit of the text range containing field * @stable ICU 2.0 */ int32_t endIndex; } UFieldPosition; #if !UCONFIG_NO_SERVICE /** * Opaque type returned by registerInstance, registerFactory and unregister for service registration. * @stable ICU 2.6 */ typedef const void* URegistryKey; #endif U_CDECL_END #endif // uiter.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * * Copyright (C) 2002-2011 International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* * file name: uiter.h * encoding: UTF-8 * tab size: 8 (not used) * indentation:4 * * created on: 2002jan18 * created by: Markus W. Scherer */ #ifndef __UITER_H__ #define __UITER_H__ /** * \file * \brief C API: Unicode Character Iteration * * @see UCharIterator */ U_CDECL_BEGIN struct UCharIterator; typedef struct UCharIterator UCharIterator; /**< C typedef for struct UCharIterator. @stable ICU 2.1 */ /** * Origin constants for UCharIterator.getIndex() and UCharIterator.move(). 
* @see UCharIteratorMove * @see UCharIterator * @stable ICU 2.1 */ typedef enum UCharIteratorOrigin { /** Relative to the start index of the iteration range. */ UITER_START, /** Relative to the current index. */ UITER_CURRENT, /** Relative to the limit index of the iteration range. */ UITER_LIMIT, /** Relative to index 0; move(iter, index, UITER_ZERO) acts like CharacterIterator::setIndex(index). */ UITER_ZERO, /** Relative to the text length; may be slow because an implementation may have to count UChars. */ UITER_LENGTH } UCharIteratorOrigin; /** Constants for UCharIterator. @stable ICU 2.6 */ enum { /** * Constant value that may be returned by UCharIteratorMove * indicating that the final UTF-16 index is not known, but that the move succeeded. * This can occur when moving relative to limit or length, or * when moving relative to the current index after a setState() * when the current UTF-16 index is not known. * * It would be very inefficient to have to count from the beginning of the text * just to get the current/limit/length index after moving relative to it. * The actual index can be determined with getIndex(UITER_CURRENT) * which will count the UChars if necessary. * * @stable ICU 2.6 */ UITER_UNKNOWN_INDEX=-2 }; /** * Constant for UCharIterator getState() indicating an error or * an unknown state. * Returned by uiter_getState()/UCharIteratorGetState * when an error occurs. * Also, some UCharIterator implementations may not be able to return * a valid state for each position. This will be clearly documented * for each such iterator (none of the public ones here). * * @stable ICU 2.6 */ #define UITER_NO_STATE ((uint32_t)0xffffffff) /** * Function type declaration for UCharIterator.getIndex(). * * Gets the current position, or the start or limit of the * iteration range. * * This function may perform slowly for UITER_CURRENT after setState() was called, * or for UITER_LENGTH, because an iterator implementation may have to count * UChars if the underlying storage is not UTF-16.
* * @param iter the UCharIterator structure ("this pointer") * @param origin get the 0, start, limit, length, or current index * @return the requested index, or U_SENTINEL in an error condition * * @see UCharIteratorOrigin * @see UCharIterator * @stable ICU 2.1 */ typedef int32_t U_CALLCONV UCharIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin); /** * Function type declaration for UCharIterator.move(). * * Use iter->move(iter, index, UITER_ZERO) like CharacterIterator::setIndex(index). * * Moves the current position relative to the start or limit of the * iteration range, or relative to the current position itself. * The movement is expressed in numbers of code units forward * or backward by specifying a positive or negative delta. * Out of bounds movement will be pinned to the start or limit. * * This function may perform slowly for moving relative to UITER_LENGTH * because an iterator implementation may have to count the rest of the * UChars if the native storage is not UTF-16. * * When moving relative to the limit or length, or * relative to the current position after setState() was called, * move() may return UITER_UNKNOWN_INDEX (-2) to avoid an inefficient * determination of the actual UTF-16 index. * The actual index can be determined with getIndex(UITER_CURRENT) * which will count the UChars if necessary. * See UITER_UNKNOWN_INDEX for details. * * @param iter the UCharIterator structure ("this pointer") * @param delta can be positive, zero, or negative * @param origin move relative to the 0, start, limit, length, or current index * @return the new index, or U_SENTINEL on an error condition, * or UITER_UNKNOWN_INDEX when the index is not known. * * @see UCharIteratorOrigin * @see UCharIterator * @see UITER_UNKNOWN_INDEX * @stable ICU 2.1 */ typedef int32_t U_CALLCONV UCharIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin); /** * Function type declaration for UCharIterator.hasNext(). 
* * Check if current() and next() can still * return another code unit. * * @param iter the UCharIterator structure ("this pointer") * @return boolean value for whether current() and next() can still return another code unit * * @see UCharIterator * @stable ICU 2.1 */ typedef UBool U_CALLCONV UCharIteratorHasNext(UCharIterator *iter); /** * Function type declaration for UCharIterator.hasPrevious(). * * Check if previous() can still return another code unit. * * @param iter the UCharIterator structure ("this pointer") * @return boolean value for whether previous() can still return another code unit * * @see UCharIterator * @stable ICU 2.1 */ typedef UBool U_CALLCONV UCharIteratorHasPrevious(UCharIterator *iter); /** * Function type declaration for UCharIterator.current(). * * Return the code unit at the current position, * or U_SENTINEL if there is none (index is at the limit). * * @param iter the UCharIterator structure ("this pointer") * @return the current code unit * * @see UCharIterator * @stable ICU 2.1 */ typedef UChar32 U_CALLCONV UCharIteratorCurrent(UCharIterator *iter); /** * Function type declaration for UCharIterator.next(). * * Return the code unit at the current index and increment * the index (post-increment, like s[i++]), * or return U_SENTINEL if there is none (index is at the limit). * * @param iter the UCharIterator structure ("this pointer") * @return the current code unit (and post-increment the current index) * * @see UCharIterator * @stable ICU 2.1 */ typedef UChar32 U_CALLCONV UCharIteratorNext(UCharIterator *iter); /** * Function type declaration for UCharIterator.previous(). * * Decrement the index and return the code unit from there * (pre-decrement, like s[--i]), * or return U_SENTINEL if there is none (index is at the start). 
* * @param iter the UCharIterator structure ("this pointer") * @return the previous code unit (after pre-decrementing the current index) * * @see UCharIterator * @stable ICU 2.1 */ typedef UChar32 U_CALLCONV UCharIteratorPrevious(UCharIterator *iter); /** * Function type declaration for UCharIterator.reservedFn(). * Reserved for future use. * * @param iter the UCharIterator structure ("this pointer") * @param something some integer argument * @return some integer * * @see UCharIterator * @stable ICU 2.1 */ typedef int32_t U_CALLCONV UCharIteratorReserved(UCharIterator *iter, int32_t something); /** * Function type declaration for UCharIterator.getState(). * * Get the "state" of the iterator in the form of a single 32-bit word. * It is recommended that the state value be calculated to be as small as * is feasible. For strings with limited lengths, fewer than 32 bits may * be sufficient. * * This is used together with setState()/UCharIteratorSetState * to save and restore the iterator position more efficiently than with * getIndex()/move(). * * The iterator state is defined as a uint32_t value because it is designed * for use in ucol_nextSortKeyPart() which provides 32 bits to store the state * of the character iterator. * * With some UCharIterator implementations (e.g., UTF-8), * getting and setting the UTF-16 index with existing functions * (getIndex(UITER_CURRENT) followed by move(pos, UITER_ZERO)) is possible but * relatively slow because the iterator has to "walk" from a known index * to the requested one. * This takes more time the farther it needs to go. * * An opaque state value allows an iterator implementation to provide * an internal index (UTF-8: the source byte array index) for * fast, constant-time restoration. 
* * After calling setState(), a getIndex(UITER_CURRENT) may be slow because * the UTF-16 index may not be restored as well, but the iterator can deliver * the correct text contents and move relative to the current position * without performance degradation. * * Some UCharIterator implementations may not be able to return * a valid state for each position, in which case they return UITER_NO_STATE instead. * This will be clearly documented for each such iterator (none of the public ones here). * * @param iter the UCharIterator structure ("this pointer") * @return the state word * * @see UCharIterator * @see UCharIteratorSetState * @see UITER_NO_STATE * @stable ICU 2.6 */ typedef uint32_t U_CALLCONV UCharIteratorGetState(const UCharIterator *iter); /** * Function type declaration for UCharIterator.setState(). * * Restore the "state" of the iterator using a state word from a getState() call. * The iterator object need not be the same one as for which getState() was called, * but it must be of the same type (set up using the same uiter_setXYZ function) * and it must iterate over the same string * (binary identical regardless of memory address). * For more about the state word see UCharIteratorGetState. * * After calling setState(), a getIndex(UITER_CURRENT) may be slow because * the UTF-16 index may not be restored as well, but the iterator can deliver * the correct text contents and move relative to the current position * without performance degradation. * * @param iter the UCharIterator structure ("this pointer") * @param state the state word from a getState() call * on a same-type, same-string iterator * @param pErrorCode Must be a valid pointer to an error code value, * which must not indicate a failure before the function call. * * @see UCharIterator * @see UCharIteratorGetState * @stable ICU 2.6 */ typedef void U_CALLCONV UCharIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode); /** * C API for code unit iteration. 
* This can be used as a C wrapper around * CharacterIterator, Replaceable, or implemented using simple strings, etc. * * There are two roles for using UCharIterator: * * A "provider" sets the necessary function pointers and controls the "protected" * fields of the UCharIterator structure. A "provider" passes a UCharIterator * into C APIs that need a UCharIterator as an abstract, flexible string interface. * * Implementations of such C APIs are "callers" of UCharIterator functions; * they only use the "public" function pointers and never access the "protected" * fields directly. * * The current() and next() functions only check the current index against the * limit, and previous() only checks the current index against the start, * to see if the iterator already reached the end of the iteration range. * * The assumption - in all iterators - is that the index is moved via the API, * which means it won't go out of bounds, or the index is modified by * user code that knows enough about the iterator implementation to set valid * index values. * * UCharIterator functions return code unit values 0..0xffff, * or U_SENTINEL if the iteration bounds are reached. * * @stable ICU 2.1 */ struct UCharIterator { /** * (protected) Pointer to string or wrapped object or similar. * Not used by caller. * @stable ICU 2.1 */ const void *context; /** * (protected) Length of string or similar. * Not used by caller. * @stable ICU 2.1 */ int32_t length; /** * (protected) Start index or similar. * Not used by caller. * @stable ICU 2.1 */ int32_t start; /** * (protected) Current index or similar. * Not used by caller. * @stable ICU 2.1 */ int32_t index; /** * (protected) Limit index or similar. * Not used by caller. * @stable ICU 2.1 */ int32_t limit; /** * (protected) Used by UTF-8 iterators and possibly others. * @stable ICU 2.1 */ int32_t reservedField; /** * (public) Returns the current position or the * start or limit index of the iteration range.
* * @see UCharIteratorGetIndex * @stable ICU 2.1 */ UCharIteratorGetIndex *getIndex; /** * (public) Moves the current position relative to the start or limit of the * iteration range, or relative to the current position itself. * The movement is expressed in numbers of code units forward * or backward by specifying a positive or negative delta. * * @see UCharIteratorMove * @stable ICU 2.1 */ UCharIteratorMove *move; /** * (public) Check if current() and next() can still * return another code unit. * * @see UCharIteratorHasNext * @stable ICU 2.1 */ UCharIteratorHasNext *hasNext; /** * (public) Check if previous() can still return another code unit. * * @see UCharIteratorHasPrevious * @stable ICU 2.1 */ UCharIteratorHasPrevious *hasPrevious; /** * (public) Return the code unit at the current position, * or U_SENTINEL if there is none (index is at the limit). * * @see UCharIteratorCurrent * @stable ICU 2.1 */ UCharIteratorCurrent *current; /** * (public) Return the code unit at the current index and increment * the index (post-increment, like s[i++]), * or return U_SENTINEL if there is none (index is at the limit). * * @see UCharIteratorNext * @stable ICU 2.1 */ UCharIteratorNext *next; /** * (public) Decrement the index and return the code unit from there * (pre-decrement, like s[--i]), * or return U_SENTINEL if there is none (index is at the start). * * @see UCharIteratorPrevious * @stable ICU 2.1 */ UCharIteratorPrevious *previous; /** * (public) Reserved for future use. Currently NULL. * * @see UCharIteratorReserved * @stable ICU 2.1 */ UCharIteratorReserved *reservedFn; /** * (public) Return the state of the iterator, to be restored later with setState(). * This function pointer is NULL if the iterator does not implement it. * * @see UCharIteratorGetState * @stable ICU 2.6 */ UCharIteratorGetState *getState; /** * (public) Restore the iterator state from the state word from a call * to getState(). * This function pointer is NULL if the iterator does not implement it.
* * @see UCharIteratorSetState * @stable ICU 2.6 */ UCharIteratorSetState *setState; }; /** * Helper function for UCharIterator to get the code point * at the current index. * * Return the code point that includes the code unit at the current position, * or U_SENTINEL if there is none (index is at the limit). * If the current code unit is a lead or trail surrogate, * then the following or preceding surrogate is used to form * the code point value. * * @param iter the UCharIterator structure ("this pointer") * @return the current code point * * @see UCharIterator * @see U16_GET * @see UnicodeString::char32At() * @stable ICU 2.1 */ U_CAPI UChar32 U_EXPORT2 uiter_current32(UCharIterator *iter); /** * Helper function for UCharIterator to get the next code point. * * Return the code point at the current index and increment * the index (post-increment, like s[i++]), * or return U_SENTINEL if there is none (index is at the limit). * * @param iter the UCharIterator structure ("this pointer") * @return the current code point (and post-increment the current index) * * @see UCharIterator * @see U16_NEXT * @stable ICU 2.1 */ U_CAPI UChar32 U_EXPORT2 uiter_next32(UCharIterator *iter); /** * Helper function for UCharIterator to get the previous code point. * * Decrement the index and return the code point from there * (pre-decrement, like s[--i]), * or return U_SENTINEL if there is none (index is at the start). * * @param iter the UCharIterator structure ("this pointer") * @return the previous code point (after pre-decrementing the current index) * * @see UCharIterator * @see U16_PREV * @stable ICU 2.1 */ U_CAPI UChar32 U_EXPORT2 uiter_previous32(UCharIterator *iter); /** * Get the "state" of the iterator in the form of a single 32-bit word. * This is a convenience function that calls iter->getState(iter) * if iter->getState is not NULL; * if it is NULL or any other error occurs, then UITER_NO_STATE is returned.
* * Some UCharIterator implementations may not be able to return * a valid state for each position, in which case they return UITER_NO_STATE instead. * This will be clearly documented for each such iterator (none of the public ones here). * * @param iter the UCharIterator structure ("this pointer") * @return the state word * * @see UCharIterator * @see UCharIteratorGetState * @see UITER_NO_STATE * @stable ICU 2.6 */ U_CAPI uint32_t U_EXPORT2 uiter_getState(const UCharIterator *iter); /** * Restore the "state" of the iterator using a state word from a getState() call. * This is a convenience function that calls iter->setState(iter, state, pErrorCode) * if iter->setState is not NULL; if it is NULL, then U_UNSUPPORTED_ERROR is set. * * @param iter the UCharIterator structure ("this pointer") * @param state the state word from a getState() call * on a same-type, same-string iterator * @param pErrorCode Must be a valid pointer to an error code value, * which must not indicate a failure before the function call. * * @see UCharIterator * @see UCharIteratorSetState * @stable ICU 2.6 */ U_CAPI void U_EXPORT2 uiter_setState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode); /** * Set up a UCharIterator to iterate over a string. * * Sets the UCharIterator function pointers for iteration over the string s * with iteration boundaries start=index=0 and length=limit=string length. * The "provider" may set the start, index, and limit values at any time * within the range 0..length. * The length field will be ignored. * * The string pointer s is set into UCharIterator.context without copying * or reallocating the string contents. * * getState() simply returns the current index. * move() will always return the final index. 
* * @param iter UCharIterator structure to be set for iteration * @param s String to iterate over * @param length Length of s, or -1 if NUL-terminated * * @see UCharIterator * @stable ICU 2.1 */ U_CAPI void U_EXPORT2 uiter_setString(UCharIterator *iter, const UChar *s, int32_t length); /** * Set up a UCharIterator to iterate over a UTF-16BE string * (byte vector with a big-endian pair of bytes per UChar). * * Everything works just like with a normal UChar iterator (uiter_setString), * except that UChars are assembled from byte pairs, * and that the length argument here indicates an even number of bytes. * * getState() simply returns the current index. * move() will always return the final index. * * @param iter UCharIterator structure to be set for iteration * @param s UTF-16BE string to iterate over * @param length Length of s as an even number of bytes, or -1 if NUL-terminated * (NUL means pair of 0 bytes at even index from s) * * @see UCharIterator * @see uiter_setString * @stable ICU 2.6 */ U_CAPI void U_EXPORT2 uiter_setUTF16BE(UCharIterator *iter, const char *s, int32_t length); /** * Set up a UCharIterator to iterate over a UTF-8 string. * * Sets the UCharIterator function pointers for iteration over the UTF-8 string s * with UTF-8 iteration boundaries 0 and length. * The implementation counts the UTF-16 index on the fly and * lazily evaluates the UTF-16 length of the text. * * The start field is used as the UTF-8 offset, the limit field as the UTF-8 length. * When the reservedField is not 0, then it contains a supplementary code point * and the UTF-16 index is between the two corresponding surrogates. * At that point, the UTF-8 index is behind that code point. * * The UTF-8 string pointer s is set into UCharIterator.context without copying * or reallocating the string contents. 
* * getState() returns a state value consisting of * - the current UTF-8 source byte index (bits 31..1) * - a flag (bit 0) that indicates whether the UChar position is in the middle * of a surrogate pair * (from a 4-byte UTF-8 sequence for the corresponding supplementary code point) * * getState() cannot also encode the UTF-16 index in the state value. * move(relative to limit or length), or * move(relative to current) after setState(), may return UITER_UNKNOWN_INDEX. * * @param iter UCharIterator structure to be set for iteration * @param s UTF-8 string to iterate over * @param length Length of s in bytes, or -1 if NUL-terminated * * @see UCharIterator * @stable ICU 2.6 */ U_CAPI void U_EXPORT2 uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length); U_CDECL_END #endif // uenum.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * * Copyright (C) 2002-2013, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* * file name: uenum.h * encoding: UTF-8 * tab size: 8 (not used) * indentation:2 * * created on: 2002jul08 * created by: Vladimir Weinstein */ #ifndef __UENUM_H #define __UENUM_H /** * \file * \brief C API: String Enumeration */ /** * An enumeration object. * For usage in C programs. * @stable ICU 2.2 */ struct UEnumeration; /** structure representing an enumeration object instance @stable ICU 2.2 */ typedef struct UEnumeration UEnumeration; /** * Disposes of resources in use by the iterator. If en is NULL, * does nothing. After this call, any char* or UChar* pointer * returned by uenum_unext() or uenum_next() is invalid. 
* @param en UEnumeration structure pointer * @stable ICU 2.2 */ U_CAPI void U_EXPORT2 uenum_close(UEnumeration* en); /** * Returns the number of elements that the iterator traverses. If * the iterator is out-of-sync with its service, status is set to * U_ENUM_OUT_OF_SYNC_ERROR. * This is a convenience function. It can end up being very * expensive as all the items might have to be pre-fetched (depending * on the type of data being traversed). Use with caution and only * when necessary. * @param en UEnumeration structure pointer * @param status error code, can be U_ENUM_OUT_OF_SYNC_ERROR if the * iterator is out of sync. * @return number of elements in the iterator * @stable ICU 2.2 */ U_CAPI int32_t U_EXPORT2 uenum_count(UEnumeration* en, UErrorCode* status); /** * Returns the next element in the iterator's list. If there are * no more elements, returns NULL. If the iterator is out-of-sync * with its service, status is set to U_ENUM_OUT_OF_SYNC_ERROR and * NULL is returned. If the native service string is a char* string, * it is converted to UChar* with the invariant converter. * The result is terminated by (UChar)0. * @param en the iterator object * @param resultLength pointer to receive the length of the result * (not including the terminating \\0). * If the pointer is NULL it is ignored. * @param status the error code, set to U_ENUM_OUT_OF_SYNC_ERROR if * the iterator is out of sync with its service. * @return a pointer to the string. The string will be * zero-terminated. The return pointer is owned by this iterator * and must not be deleted by the caller. The pointer is valid * until the next call to any uenum_... method, including * uenum_next() or uenum_unext(). When all strings have been * traversed, returns NULL. * @stable ICU 2.2 */ U_CAPI const UChar* U_EXPORT2 uenum_unext(UEnumeration* en, int32_t* resultLength, UErrorCode* status); /** * Returns the next element in the iterator's list. If there are * no more elements, returns NULL. 
If the iterator is out-of-sync * with its service, status is set to U_ENUM_OUT_OF_SYNC_ERROR and * NULL is returned. If the native service string is a UChar* * string, it is converted to char* with the invariant converter. * The result is terminated by (char)0. If the conversion fails * (because a character cannot be converted) then status is set to * U_INVARIANT_CONVERSION_ERROR and the return value is undefined * (but non-NULL). * @param en the iterator object * @param resultLength pointer to receive the length of the result * (not including the terminating \\0). * If the pointer is NULL it is ignored. * @param status the error code, set to U_ENUM_OUT_OF_SYNC_ERROR if * the iterator is out of sync with its service. Set to * U_INVARIANT_CONVERSION_ERROR if the underlying native string is * UChar* and conversion to char* with the invariant converter * fails. This error pertains only to current string, so iteration * might be able to continue successfully. * @return a pointer to the string. The string will be * zero-terminated. The return pointer is owned by this iterator * and must not be deleted by the caller. The pointer is valid * until the next call to any uenum_... method, including * uenum_next() or uenum_unext(). When all strings have been * traversed, returns NULL. * @stable ICU 2.2 */ U_CAPI const char* U_EXPORT2 uenum_next(UEnumeration* en, int32_t* resultLength, UErrorCode* status); /** * Resets the iterator to the current list of service IDs. This * re-establishes sync with the service and rewinds the iterator * to start at the first element. * @param en the iterator object * @param status the error code, set to U_ENUM_OUT_OF_SYNC_ERROR if * the iterator is out of sync with its service. * @stable ICU 2.2 */ U_CAPI void U_EXPORT2 uenum_reset(UEnumeration* en, UErrorCode* status); /** * Given an array of const UChar* strings, return a UEnumeration. String pointers from 0..count-1 must not be null. 
* Do not free or modify either the string array or the characters it points to until this object has been destroyed with uenum_close. * \snippet test/cintltst/uenumtst.c uenum_openUCharStringsEnumeration * @param strings array of const UChar* strings (each null terminated). All storage is owned by the caller. * @param count length of the array * @param ec error code * @return the new UEnumeration object. Caller is responsible for calling uenum_close to free memory. * @see uenum_close * @stable ICU 50 */ U_CAPI UEnumeration* U_EXPORT2 uenum_openUCharStringsEnumeration(const UChar* const strings[], int32_t count, UErrorCode* ec); /** * Given an array of const char* strings (invariant chars only), return a UEnumeration. String pointers from 0..count-1 must not be null. * Do not free or modify either the string array or the characters it points to until this object has been destroyed with uenum_close. * \snippet test/cintltst/uenumtst.c uenum_openCharStringsEnumeration * @param strings array of char* strings (each null terminated). All storage is owned by the caller. * @param count length of the array * @param ec error code * @return the new UEnumeration object. Caller is responsible for calling uenum_close to free memory * @see uenum_close * @stable ICU 50 */ U_CAPI UEnumeration* U_EXPORT2 uenum_openCharStringsEnumeration(const char* const strings[], int32_t count, UErrorCode* ec); #endif // uloc.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (C) 1997-2016, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * * File ULOC.H * * Modification History: * * Date Name Description * 04/01/97 aliu Creation. * 08/22/98 stephen JDK 1.2 sync. 
* 12/08/98 rtg New C API for Locale * 03/30/99 damiba overhaul * 03/31/99 helena Javadoc for uloc functions. * 04/15/99 Madhu Updated Javadoc ******************************************************************************** */ #ifndef ULOC_H #define ULOC_H /** * \file * \brief C API: Locale ID functionality similar to C++ class Locale * *

ULoc C API for Locale

* A Locale represents a specific geographical, political, * or cultural region. An operation that requires a Locale to perform * its task is called locale-sensitive and uses the Locale * to tailor information for the user. For example, displaying a number * is a locale-sensitive operation--the number should be formatted * according to the customs/conventions of the user's native country, * region, or culture. In the C APIs, a locale is simply a const char string. * *

* You create a Locale with one of the three options listed below. * The components are separated by '_' in the locale string. * \htmlonly

\endhtmlonly *
 * \code
 *       newLanguage
 * 
 *       newLanguage + newCountry
 * 
 *       newLanguage + newCountry + newVariant
 * \endcode
 * 
* \htmlonly
\endhtmlonly * The first option is a valid ISO * Language Code. These codes are the lower-case two-letter * codes as defined by ISO-639. * You can find a full list of these codes at a number of sites, such as: *
* http://www.ics.uci.edu/pub/ietf/http/related/iso639.txt * *

* The second option includes an additional ISO Country * Code. These codes are the upper-case two-letter codes * as defined by ISO-3166. * You can find a full list of these codes at a number of sites, such as: *
* http://www.chemie.fu-berlin.de/diverse/doc/ISO_3166.html * *

* The third option requires one further piece of information--the * Variant. * The Variant codes are vendor and browser-specific. * For example, use WIN for Windows, MAC for Macintosh, and POSIX for POSIX. * Where there are two variants, separate them with an underscore, and * put the most important one first. For * example, a Traditional Spanish collation might be referenced, with * "ES", "ES", "Traditional_WIN". * *

* Because a Locale is just an identifier for a region, * no validity check is performed when you specify a Locale. * If you want to see whether particular resources are available for the * Locale you asked for, you must query those resources. For * example, ask the UNumberFormat for the locales it supports * using its getAvailable method. *
Note: When you ask for a resource for a particular * locale, you get back the best available match, not necessarily * precisely what you asked for. For more information, look at * UResourceBundle. * *

* The Locale provides a number of convenient constants * that you can use to specify the commonly used * locales. For example, the following refers to a locale * for the United States: * \htmlonly

\endhtmlonly *
 * \code
 *       ULOC_US
 * \endcode
 * 
* \htmlonly
\endhtmlonly * *

* Once you've specified a locale you can query it for information about * itself. Use uloc_getCountry to get the ISO Country Code and * uloc_getLanguage to get the ISO Language Code. You can * use uloc_getDisplayCountry to get the * name of the country suitable for displaying to the user. Similarly, * you can use uloc_getDisplayLanguage to get the name of * the language suitable for displaying to the user. Interestingly, * the uloc_getDisplayXXX methods are themselves locale-sensitive * and have two versions: one that uses the default locale and one * that takes a locale as an argument and displays the name or country in * a language appropriate to that locale. * *

* The ICU provides a number of services that perform locale-sensitive * operations. For example, the unum_xxx functions format * numbers, currency, or percentages in a locale-sensitive manner. *

* \htmlonly
\endhtmlonly *
 * \code
 *     UErrorCode success = U_ZERO_ERROR;
 *     UNumberFormat *nf;
 *     const char* myLocale = "fr_FR";
 * 
 *     nf = unum_open( UNUM_DEFAULT, NULL, 0, NULL, NULL, &success );
 *     unum_close(nf);
 *     nf = unum_open( UNUM_CURRENCY, NULL, 0, NULL, NULL, &success );
 *     unum_close(nf);
 *     nf = unum_open( UNUM_PERCENT, NULL, 0, NULL, NULL, &success );
 *     unum_close(nf);
 * \endcode
 * 
* \htmlonly
\endhtmlonly * Each of these methods has two variants; one with an explicit locale * and one without; the latter using the default locale. * \htmlonly
\endhtmlonly *
 * \code 
 * 
 *     nf = unum_open( UNUM_DEFAULT, NULL, 0, myLocale, NULL, &success );
 *     unum_close(nf);
 *     nf = unum_open( UNUM_CURRENCY, NULL, 0, myLocale, NULL, &success );
 *     unum_close(nf);
 *     nf = unum_open( UNUM_PERCENT, NULL, 0, myLocale, NULL, &success );
 *     unum_close(nf);
 * \endcode
 * 
* \htmlonly
\endhtmlonly * A Locale is the mechanism for identifying the kind of services * (UNumberFormat) that you would like to get. The locale is * just a mechanism for identifying these services. * *

* Each international service that performs locale-sensitive operations * allows you * to get all the available objects of that type. You can sift * through these objects by language, country, or variant, * and use the display names to present a menu to the user. * For example, you can create a menu of all the collation objects * suitable for a given language. Such classes implement these * three class methods: * \htmlonly

\endhtmlonly *
 * \code
 *       const char* uloc_getAvailable(int32_t index);
 *       int32_t uloc_countAvailable();
 *       int32_t
 *       uloc_getDisplayName(const char* localeID,
 *                 const char* inLocaleID, 
 *                 UChar* result,
 *                 int32_t maxResultSize,
 *                  UErrorCode* err);
 * 
 * \endcode
 * 
* \htmlonly
\endhtmlonly *

* Concerning POSIX/RFC1766 Locale IDs, * the getLanguage/getCountry/getVariant/getName functions do understand * the POSIX type form of language_COUNTRY.ENCODING\@VARIANT * and if there is not an ICU-style variant, uloc_getVariant() for example * will return the one listed after the \@ sign. As well, the hyphen * "-" is recognized as a country/variant separator similarly to RFC1766. * So for example, "en-us" will be interpreted as en_US. * As a result, uloc_getName() is far from a no-op, and will have the * effect of converting POSIX/RFC1766 IDs into ICU form, although it does * NOT map any of the actual codes (e.g. russian->ru) in any way. * Applications should call uloc_getName() at the point where a locale ID * is coming from an external source (user entry, OS, web browser) * and pass the resulting string to other ICU functions. For example, * don't use de-de\@EURO as an argument to resourcebundle. * * @see UResourceBundle */ /** Useful constant for this language. @stable ICU 2.0 */ #define ULOC_CHINESE "zh" /** Useful constant for this language. @stable ICU 2.0 */ #define ULOC_ENGLISH "en" /** Useful constant for this language. @stable ICU 2.0 */ #define ULOC_FRENCH "fr" /** Useful constant for this language. @stable ICU 2.0 */ #define ULOC_GERMAN "de" /** Useful constant for this language. @stable ICU 2.0 */ #define ULOC_ITALIAN "it" /** Useful constant for this language. @stable ICU 2.0 */ #define ULOC_JAPANESE "ja" /** Useful constant for this language. @stable ICU 2.0 */ #define ULOC_KOREAN "ko" /** Useful constant for this language. @stable ICU 2.0 */ #define ULOC_SIMPLIFIED_CHINESE "zh_CN" /** Useful constant for this language. @stable ICU 2.0 */ #define ULOC_TRADITIONAL_CHINESE "zh_TW" /** Useful constant for this country/region. @stable ICU 2.0 */ #define ULOC_CANADA "en_CA" /** Useful constant for this country/region. @stable ICU 2.0 */ #define ULOC_CANADA_FRENCH "fr_CA" /** Useful constant for this country/region.
@stable ICU 2.0 */ #define ULOC_CHINA "zh_CN" /** Useful constant for this country/region. @stable ICU 2.0 */ #define ULOC_PRC "zh_CN" /** Useful constant for this country/region. @stable ICU 2.0 */ #define ULOC_FRANCE "fr_FR" /** Useful constant for this country/region. @stable ICU 2.0 */ #define ULOC_GERMANY "de_DE" /** Useful constant for this country/region. @stable ICU 2.0 */ #define ULOC_ITALY "it_IT" /** Useful constant for this country/region. @stable ICU 2.0 */ #define ULOC_JAPAN "ja_JP" /** Useful constant for this country/region. @stable ICU 2.0 */ #define ULOC_KOREA "ko_KR" /** Useful constant for this country/region. @stable ICU 2.0 */ #define ULOC_TAIWAN "zh_TW" /** Useful constant for this country/region. @stable ICU 2.0 */ #define ULOC_UK "en_GB" /** Useful constant for this country/region. @stable ICU 2.0 */ #define ULOC_US "en_US" /** * Useful constant for the maximum size of the language part of a locale ID. * (including the terminating NULL). * @stable ICU 2.0 */ #define ULOC_LANG_CAPACITY 12 /** * Useful constant for the maximum size of the country part of a locale ID * (including the terminating NULL). * @stable ICU 2.0 */ #define ULOC_COUNTRY_CAPACITY 4 /** * Useful constant for the maximum size of the whole locale ID * (including the terminating NULL and all keywords). * @stable ICU 2.0 */ #define ULOC_FULLNAME_CAPACITY 157 /** * Useful constant for the maximum size of the script part of a locale ID * (including the terminating NULL). 
* @stable ICU 2.8 */ #define ULOC_SCRIPT_CAPACITY 6 /** * Useful constant for the maximum size of keywords in a locale * @stable ICU 2.8 */ #define ULOC_KEYWORDS_CAPACITY 96 /** * Useful constant for the maximum total size of keywords and their values in a locale * @stable ICU 2.8 */ #define ULOC_KEYWORD_AND_VALUES_CAPACITY 100 /** * Invariant character separating keywords from the locale string * @stable ICU 2.8 */ #define ULOC_KEYWORD_SEPARATOR '@' /** * Unicode code point for '@' separating keywords from the locale string. * @see ULOC_KEYWORD_SEPARATOR * @stable ICU 4.6 */ #define ULOC_KEYWORD_SEPARATOR_UNICODE 0x40 /** * Invariant character for assigning value to a keyword * @stable ICU 2.8 */ #define ULOC_KEYWORD_ASSIGN '=' /** * Unicode code point for '=' for assigning value to a keyword. * @see ULOC_KEYWORD_ASSIGN * @stable ICU 4.6 */ #define ULOC_KEYWORD_ASSIGN_UNICODE 0x3D /** * Invariant character separating keywords * @stable ICU 2.8 */ #define ULOC_KEYWORD_ITEM_SEPARATOR ';' /** * Unicode code point for ';' separating keywords * @see ULOC_KEYWORD_ITEM_SEPARATOR * @stable ICU 4.6 */ #define ULOC_KEYWORD_ITEM_SEPARATOR_UNICODE 0x3B /** * Constants for *_getLocale() * Allow user to select whether she wants information on * requested, valid or actual locale. * For example, a collator for "en_US_CALIFORNIA" was * requested. In the current state of ICU (2.0), * the requested locale is "en_US_CALIFORNIA", * the valid locale is "en_US" (most specific locale supported by ICU) * and the actual locale is "root" (the collation data comes unmodified * from the UCA) * The locale is considered supported by ICU if there is a core ICU bundle * for that locale (although it may be empty). 
* @stable ICU 2.1 */ typedef enum { /** This is the locale the data actually comes from * @stable ICU 2.1 */ ULOC_ACTUAL_LOCALE = 0, /** This is the most specific locale supported by ICU * @stable ICU 2.1 */ ULOC_VALID_LOCALE = 1, } ULocDataLocaleType; #ifndef U_HIDE_SYSTEM_API /** * Gets ICU's default locale. * The returned string is a snapshot in time, and will remain valid * and unchanged even when uloc_setDefault() is called. * The returned storage is owned by ICU, and must not be altered or deleted * by the caller. * * @return the ICU default locale * @system * @stable ICU 2.0 */ U_CAPI const char* U_EXPORT2 uloc_getDefault(void); /** * Sets ICU's default locale. * By default (without calling this function), ICU's default locale will be based * on information obtained from the underlying system environment. *

* Changes to ICU's default locale do not propagate back to the * system environment. *

* Changes to ICU's default locale do not affect any ICU services that * may already be open based on the previous default locale value. * * @param localeID the new ICU default locale. A value of NULL will try to get * the system's default locale. * @param status the error information if the setting of default locale fails * @system * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 uloc_setDefault(const char* localeID, UErrorCode* status); #endif /* U_HIDE_SYSTEM_API */ /** * Gets the language code for the specified locale. * * @param localeID the locale to get the ISO language code with * @param language the language code for localeID * @param languageCapacity the size of the language buffer to store the * language code with * @param err error information if retrieving the language code failed * @return the actual buffer size needed for the language code. If it's greater * than languageCapacity, the returned language code will be truncated. * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 uloc_getLanguage(const char* localeID, char* language, int32_t languageCapacity, UErrorCode* err); /** * Gets the script code for the specified locale. * * @param localeID the locale to get the script code with * @param script the script code for localeID * @param scriptCapacity the size of the script buffer to store the * script code with * @param err error information if retrieving the script code failed * @return the actual buffer size needed for the script code. If it's greater * than scriptCapacity, the returned script code will be truncated. * @stable ICU 2.8 */ U_CAPI int32_t U_EXPORT2 uloc_getScript(const char* localeID, char* script, int32_t scriptCapacity, UErrorCode* err); /** * Gets the country code for the specified locale. 
* * @param localeID the locale to get the country code with * @param country the country code for localeID * @param countryCapacity the size of the country buffer to store the * country code with * @param err error information if retrieving the country code failed * @return the actual buffer size needed for the country code. If it's greater * than countryCapacity, the returned country code will be truncated. * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 uloc_getCountry(const char* localeID, char* country, int32_t countryCapacity, UErrorCode* err); /** * Gets the variant code for the specified locale. * * @param localeID the locale to get the variant code with * @param variant the variant code for localeID * @param variantCapacity the size of the variant buffer to store the * variant code with * @param err error information if retrieving the variant code failed * @return the actual buffer size needed for the variant code. If it's greater * than variantCapacity, the returned variant code will be truncated. * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 uloc_getVariant(const char* localeID, char* variant, int32_t variantCapacity, UErrorCode* err); /** * Gets the full name for the specified locale. * Note: This has the effect of 'canonicalizing' the ICU locale ID to * a certain extent. Upper and lower case are set as needed. * It does NOT map aliased names in any way. * See the top of this header file. * This API supports preflighting. * * @param localeID the locale to get the full name with * @param name fill in buffer for the full name (use uloc_getBaseName() * to get the name without keywords). * @param nameCapacity capacity of the fill in buffer. * @param err error information if retrieving the full name failed * @return the actual buffer size needed for the full name. If it's greater * than nameCapacity, the returned full name will be truncated. 
* @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 uloc_getName(const char* localeID, char* name, int32_t nameCapacity, UErrorCode* err); /** * Gets the full name for the specified locale. * Note: This has the effect of 'canonicalizing' the string to * a certain extent. Upper and lower case are set as needed, * and if the components were in 'POSIX' format they are changed to * ICU format. It does NOT map aliased names in any way. * See the top of this header file. * * @param localeID the locale to get the full name with * @param name the full name for localeID * @param nameCapacity the size of the name buffer to store the * full name with * @param err error information if retrieving the full name failed * @return the actual buffer size needed for the full name. If it's greater * than nameCapacity, the returned full name will be truncated. * @stable ICU 2.8 */ U_CAPI int32_t U_EXPORT2 uloc_canonicalize(const char* localeID, char* name, int32_t nameCapacity, UErrorCode* err); /** * Gets the ISO language code for the specified locale. * * @param localeID the locale to get the ISO language code with * @return language the ISO language code for localeID * @stable ICU 2.0 */ U_CAPI const char* U_EXPORT2 uloc_getISO3Language(const char* localeID); /** * Gets the ISO country code for the specified locale. * * @param localeID the locale to get the ISO country code with * @return country the ISO country code for localeID * @stable ICU 2.0 */ U_CAPI const char* U_EXPORT2 uloc_getISO3Country(const char* localeID); /** * Gets the Win32 LCID value for the specified locale. * If the ICU locale is not recognized by Windows, 0 will be returned. * * LCIDs were deprecated with Windows Vista and Microsoft recommends * that developers use BCP47 style tags instead (uloc_toLanguageTag). 
* * @param localeID the locale to get the Win32 LCID value with * @return country the Win32 LCID for localeID * @stable ICU 2.0 */ U_CAPI uint32_t U_EXPORT2 uloc_getLCID(const char* localeID); /** * Gets the language name suitable for display for the specified locale. * * @param locale the locale to get the ISO language code with * @param displayLocale Specifies the locale to be used to display the name. In * other words, if the locale's language code is "en", passing * Locale::getFrench() for inLocale would result in "Anglais", * while passing Locale::getGerman() for inLocale would result * in "Englisch". * @param language the displayable language code for localeID * @param languageCapacity the size of the language buffer to store the * displayable language code with. * @param status error information if retrieving the displayable language code * failed. U_USING_DEFAULT_WARNING indicates that no data was * found from the locale resources and a case canonicalized * language code is placed into language as fallback. * @return the actual buffer size needed for the displayable language code. If * it's greater than languageCapacity, the returned language * code will be truncated. * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 uloc_getDisplayLanguage(const char* locale, const char* displayLocale, UChar* language, int32_t languageCapacity, UErrorCode* status); /** * Gets the script name suitable for display for the specified locale. * * @param locale the locale to get the displayable script code with. NULL may be * used to specify the default. * @param displayLocale Specifies the locale to be used to display the name. In * other words, if the locale's language code is "en", passing * Locale::getFrench() for inLocale would result in "", while * passing Locale::getGerman() for inLocale would result in "". * NULL may be used to specify the default. * @param script the displayable script for the localeID. 
* @param scriptCapacity the size of the script buffer to store the displayable * script code with. * @param status error information if retrieving the displayable script code * failed. U_USING_DEFAULT_WARNING indicates that no data was * found from the locale resources and a case canonicalized * script code is placed into script as fallback. * @return the actual buffer size needed for the displayable script code. If * it's greater than scriptCapacity, the returned displayable * script code will be truncated. * @stable ICU 2.8 */ U_CAPI int32_t U_EXPORT2 uloc_getDisplayScript(const char* locale, const char* displayLocale, UChar* script, int32_t scriptCapacity, UErrorCode* status); /** * Gets the country name suitable for display for the specified locale. * Warning: this is for the region part of a valid locale ID; it cannot just be * the region code (like "FR"). To get the display name for a region alone, or * for other options, use ULocaleDisplayNames instead. * * @param locale the locale to get the displayable country code with. NULL may * be used to specify the default. * @param displayLocale Specifies the locale to be used to display the name. In * other words, if the locale's language code is "en", passing * Locale::getFrench() for inLocale would result in "Anglais", * while passing Locale::getGerman() for inLocale would result * in "Englisch". NULL may be used to specify the default. * @param country the displayable country code for localeID. * @param countryCapacity the size of the country buffer to store the * displayable country code with. * @param status error information if retrieving the displayable country code * failed. U_USING_DEFAULT_WARNING indicates that no data was * found from the locale resources and a case canonicalized * country code is placed into country as fallback. * @return the actual buffer size needed for the displayable country code. If * it's greater than countryCapacity, the returned displayable * country code will be truncated. 
* @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 uloc_getDisplayCountry(const char* locale, const char* displayLocale, UChar* country, int32_t countryCapacity, UErrorCode* status); /** * Gets the variant name suitable for display for the specified locale. * * @param locale the locale to get the displayable variant code with. NULL may * be used to specify the default. * @param displayLocale Specifies the locale to be used to display the name. In * other words, if the locale's language code is "en", passing * Locale::getFrench() for inLocale would result in "Anglais", * while passing Locale::getGerman() for inLocale would result * in "Englisch". NULL may be used to specify the default. * @param variant the displayable variant code for localeID. * @param variantCapacity the size of the variant buffer to store the * displayable variant code with. * @param status error information if retrieving the displayable variant code * failed. U_USING_DEFAULT_WARNING indicates that no data was * found from the locale resources and a case canonicalized * variant code is placed into variant as fallback. * @return the actual buffer size needed for the displayable variant code. If * it's greater than variantCapacity, the returned displayable * variant code will be truncated. * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 uloc_getDisplayVariant(const char* locale, const char* displayLocale, UChar* variant, int32_t variantCapacity, UErrorCode* status); /** * Gets the keyword name suitable for display for the specified locale. E.g: * for the locale string de_DE\@collation=PHONEBOOK, this API gets the display * string for the keyword collation. 
* Usage: * * UErrorCode status = U_ZERO_ERROR; * const char* keyword =NULL; * int32_t keywordLen = 0; * int32_t keywordCount = 0; * UChar displayKeyword[256]; * int32_t displayKeywordLen = 0; * UEnumeration* keywordEnum = uloc_openKeywords("de_DE@collation=PHONEBOOK;calendar=TRADITIONAL", &status); * for(keywordCount = uenum_count(keywordEnum, &status); keywordCount > 0 ; keywordCount--){ * if(U_FAILURE(status)){ * ...something went wrong so handle the error... * break; * } * // the uenum_next returns NUL terminated string * keyword = uenum_next(keywordEnum, &keywordLen, &status); * displayKeywordLen = uloc_getDisplayKeyword(keyword, "en_US", displayKeyword, 256, &status); * ... do something interesting ..... * } * uenum_close(keywordEnum); * * @param keyword The keyword whose display string needs to be returned. * @param displayLocale Specifies the locale to be used to display the name. In other words, * if the locale's language code is "en", passing Locale::getFrench() for * inLocale would result in "Anglais", while passing Locale::getGerman() * for inLocale would result in "Englisch". NULL may be used to specify the default. * @param dest the buffer to which the displayable keyword should be written. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then * dest may be NULL and the function will only return the length of the * result without writing any of the result string (pre-flighting). * @param status error information if retrieving the displayable string failed. * Should not be NULL and should not indicate failure on entry. * U_USING_DEFAULT_WARNING indicates that no data was found from the locale * resources and the keyword is placed into dest as fallback. * @return the actual buffer size needed for the displayable keyword. 
* @see #uloc_openKeywords * @stable ICU 2.8 */ U_CAPI int32_t U_EXPORT2 uloc_getDisplayKeyword(const char* keyword, const char* displayLocale, UChar* dest, int32_t destCapacity, UErrorCode* status); /** * Gets the value of the keyword suitable for display for the specified locale. * E.g., for the locale string de_DE\@collation=PHONEBOOK, this API gets the display * string for PHONEBOOK, in the display locale, when "collation" is specified as the keyword. * * @param locale The locale whose keyword value is to be displayed. NULL may be used to specify the default. * @param keyword The keyword whose value should be used. * @param displayLocale Specifies the locale to be used to display the name. In other words, * if the locale's language code is "en", passing Locale::getFrench() for * inLocale would result in "Anglais", while passing Locale::getGerman() * for inLocale would result in "Englisch". NULL may be used to specify the default. * @param dest the buffer to which the displayable keyword value should be written. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then * dest may be NULL and the function will only return the length of the * result without writing any of the result string (pre-flighting). * @param status error information if retrieving the displayable string failed. * Should not be NULL and must not indicate failure on entry. * U_USING_DEFAULT_WARNING indicates that no data was found from the locale * resources and the value of the keyword is placed into dest as fallback. * @return the actual buffer size needed for the displayable keyword value. * @stable ICU 2.8 */ U_CAPI int32_t U_EXPORT2 uloc_getDisplayKeywordValue( const char* locale, const char* keyword, const char* displayLocale, UChar* dest, int32_t destCapacity, UErrorCode* status); /** * Gets the full name suitable for display for the specified locale. * * @param localeID the locale to get the displayable name with. NULL may be used to specify the default. 
* @param inLocaleID Specifies the locale to be used to display the name. In other words, * if the locale's language code is "en", passing Locale::getFrench() for * inLocale would result in "Anglais", while passing Locale::getGerman() * for inLocale would result in "Englisch". NULL may be used to specify the default. * @param result the displayable name for localeID * @param maxResultSize the size of the name buffer to store the * displayable full name with * @param err error information if retrieving the displayable name failed * @return the actual buffer size needed for the displayable name. If it's greater * than maxResultSize, the returned displayable name will be truncated. * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 uloc_getDisplayName(const char* localeID, const char* inLocaleID, UChar* result, int32_t maxResultSize, UErrorCode* err); /** * Gets the specified locale from a list of available locales. * * This method corresponds to uloc_openAvailableByType called with the * ULOC_AVAILABLE_DEFAULT type argument. * * The return value is a pointer to an item of a locale name array. Both this * array and the pointers it contains are owned by ICU and should not be * deleted or written through by the caller. The locale name is a * NUL-terminated string. * * @param n the specific locale name index of the available locale list; * should not exceed the number returned by uloc_countAvailable. * @return a specified locale name of all available locales * @stable ICU 2.0 */ U_CAPI const char* U_EXPORT2 uloc_getAvailable(int32_t n); /** * Gets the size of the list of all available locales. * * @return the size of the locale list * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 uloc_countAvailable(void); #if (NTDDI_VERSION >= NTDDI_WIN10_CO) /** * Types for uloc_openAvailableByType. * * @stable ICU 65 */ typedef enum ULocAvailableType { /** * Locales that return data when passed to ICU APIs, * but not including legacy or alias locales. 
* * @stable ICU 65 */ ULOC_AVAILABLE_DEFAULT, /** * Legacy or alias locales that return data when passed to ICU APIs. * Examples of supported legacy or alias locales: * * - iw (alias to he) * - mo (alias to ro) * - zh_CN (alias to zh_Hans_CN) * - sr_BA (alias to sr_Cyrl_BA) * - ars (alias to ar_SA) * * The locales in this set are disjoint from the ones in * ULOC_AVAILABLE_DEFAULT. To get both sets at the same time, use * ULOC_AVAILABLE_WITH_LEGACY_ALIASES. * * @stable ICU 65 */ ULOC_AVAILABLE_ONLY_LEGACY_ALIASES, /** * The union of the locales in ULOC_AVAILABLE_DEFAULT and * ULOC_AVAILABLE_ONLY_LEGACY_ALIASES. * * @stable ICU 65 */ ULOC_AVAILABLE_WITH_LEGACY_ALIASES, } ULocAvailableType; /** * Gets a list of available locales according to the type argument, allowing * the user to access different sets of supported locales in ICU. * * The returned UEnumeration must be closed by the caller. * * @param type Type choice from ULocAvailableType. * @param status Set if an error occurred. * @return a UEnumeration owned by the caller, or NULL on failure. * @stable ICU 65 */ U_CAPI UEnumeration* U_EXPORT2 uloc_openAvailableByType(ULocAvailableType type, UErrorCode* status); #endif // (NTDDI_VERSION >= NTDDI_WIN10_CO) /** * * Gets a list of all available 2-letter language codes defined in ISO 639, * plus additional 3-letter codes determined to be useful for locale generation as * defined by Unicode CLDR. This is a pointer * to an array of pointers to arrays of char. All of these pointers are owned * by ICU-- do not delete them, and do not write through them. The array is * terminated with a null pointer. * @return a list of all available language codes * @stable ICU 2.0 */ U_CAPI const char* const* U_EXPORT2 uloc_getISOLanguages(void); /** * * Gets a list of all available 2-letter country codes defined in ISO 3166. This is a * pointer to an array of pointers to arrays of char. All of these pointers are * owned by ICU-- do not delete them, and do not write through them. 
The array is * terminated with a null pointer. * @return a list of all available country codes * @stable ICU 2.0 */ U_CAPI const char* const* U_EXPORT2 uloc_getISOCountries(void); /** * Truncate the locale ID string to get the parent locale ID. * Copies the part of the string before the last underscore. * The parent locale ID will be an empty string if there is no * underscore, or if there is only one underscore at localeID[0]. * * @param localeID Input locale ID string. * @param parent Output string buffer for the parent locale ID. * @param parentCapacity Size of the output buffer. * @param err A UErrorCode value. * @return The length of the parent locale ID. * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 uloc_getParent(const char* localeID, char* parent, int32_t parentCapacity, UErrorCode* err); /** * Gets the full name for the specified locale, like uloc_getName(), * but without keywords. * * Note: This has the effect of 'canonicalizing' the string to * a certain extent. Upper and lower case are set as needed, * and if the components were in 'POSIX' format they are changed to * ICU format. It does NOT map aliased names in any way. * See the top of this header file. * * This API strips off the keyword part, so "de_DE\@collation=phonebook" * will become "de_DE". * This API supports preflighting. * * @param localeID the locale to get the full name with * @param name fill in buffer for the name without keywords. * @param nameCapacity capacity of the fill in buffer. * @param err error information if retrieving the full name failed * @return the actual buffer size needed for the full name. If it's greater * than nameCapacity, the returned full name will be truncated. * @stable ICU 2.8 */ U_CAPI int32_t U_EXPORT2 uloc_getBaseName(const char* localeID, char* name, int32_t nameCapacity, UErrorCode* err); /** * Gets an enumeration of keywords for the specified locale. Enumeration * must get disposed of by the client using uenum_close function. 
* * @param localeID the locale whose keywords are to be enumerated * @param status error information if retrieving the keywords failed * @return enumeration of keywords or NULL if there are no keywords. * @stable ICU 2.8 */ U_CAPI UEnumeration* U_EXPORT2 uloc_openKeywords(const char* localeID, UErrorCode* status); /** * Get the value for a keyword. Locale name does not need to be normalized. * * @param localeID locale name containing the keyword ("de_DE@currency=EURO;collation=PHONEBOOK") * @param keywordName name of the keyword for which we want the value; must not be * NULL or empty, and must consist only of [A-Za-z0-9]. Case insensitive. * @param buffer receiving buffer * @param bufferCapacity capacity of receiving buffer * @param status containing error code: e.g. buffer not big enough or ill-formed localeID * or keywordName parameters. * @return the length of keyword value * @stable ICU 2.8 */ U_CAPI int32_t U_EXPORT2 uloc_getKeywordValue(const char* localeID, const char* keywordName, char* buffer, int32_t bufferCapacity, UErrorCode* status); /** * Sets or removes the value of the specified keyword. * * For removing all keywords, use uloc_getBaseName(). * * NOTE: Unlike almost every other ICU function which takes a * buffer, this function will NOT truncate the output text, and will * not update the buffer with unterminated text setting a status of * U_STRING_NOT_TERMINATED_WARNING. If a U_BUFFER_OVERFLOW_ERROR is received, * it means a terminated version of the updated locale ID would not fit * in the buffer, and the original buffer is untouched. This is done to * prevent incorrect or possibly even malformed locales from being generated * and used. * * @param keywordName name of the keyword to be set; must not be * NULL or empty, and must consist only of [A-Za-z0-9]. Case insensitive. * @param keywordValue value of the keyword to be set. If 0-length or * NULL, will result in the keyword being removed; no error is given if * that keyword does not exist. 
Otherwise, must consist only of * [A-Za-z0-9] and [/_+-]. * @param buffer input buffer containing well-formed locale ID to be * modified. * @param bufferCapacity capacity of receiving buffer * @param status containing error code: e.g. buffer not big enough * or ill-formed keywordName or keywordValue parameters, or ill-formed * locale ID in buffer on input. * @return the length needed for the buffer * @see uloc_getKeywordValue * @stable ICU 3.2 */ U_CAPI int32_t U_EXPORT2 uloc_setKeywordValue(const char* keywordName, const char* keywordValue, char* buffer, int32_t bufferCapacity, UErrorCode* status); /** * Returns whether the locale's script is written right-to-left. * If there is no script subtag, then the likely script is used, see uloc_addLikelySubtags(). * If no likely script is known, then false is returned. * * A script is right-to-left according to the CLDR script metadata * which corresponds to whether the script's letters have Bidi_Class=R or AL. * * Returns true for "ar" and "en-Hebr", false for "zh" and "fa-Cyrl". * * @param locale input locale ID * @return true if the locale's script is written right-to-left * @stable ICU 54 */ U_CAPI UBool U_EXPORT2 uloc_isRightToLeft(const char *locale); /** * enums for the return value for the character and line orientation * functions. * @stable ICU 4.0 */ typedef enum { ULOC_LAYOUT_LTR = 0, /* left-to-right. */ ULOC_LAYOUT_RTL = 1, /* right-to-left. */ ULOC_LAYOUT_TTB = 2, /* top-to-bottom. */ ULOC_LAYOUT_BTT = 3, /* bottom-to-top. */ ULOC_LAYOUT_UNKNOWN } ULayoutType; /** * Get the layout character orientation for the specified locale. * * @param localeId locale name * @param status Error status * @return an enum indicating the layout orientation for characters. * @stable ICU 4.0 */ U_CAPI ULayoutType U_EXPORT2 uloc_getCharacterOrientation(const char* localeId, UErrorCode *status); /** * Get the layout line orientation for the specified locale. 
* * @param localeId locale name * @param status Error status * @return an enum indicating the layout orientation for lines. * @stable ICU 4.0 */ U_CAPI ULayoutType U_EXPORT2 uloc_getLineOrientation(const char* localeId, UErrorCode *status); /** * Output values which uloc_acceptLanguage() writes to the 'outResult' parameter. * * @see uloc_acceptLanguageFromHTTP * @see uloc_acceptLanguage * @stable ICU 3.2 */ typedef enum { /** * No exact match was found. * @stable ICU 3.2 */ ULOC_ACCEPT_FAILED = 0, /** * An exact match was found. * @stable ICU 3.2 */ ULOC_ACCEPT_VALID = 1, /** * A fallback was found. For example, the Accept-Language list includes 'ja_JP' * and is matched with available locale 'ja'. * @stable ICU 3.2 */ ULOC_ACCEPT_FALLBACK = 2 /* */ } UAcceptResult; /** * Based on a HTTP header from a web browser and a list of available locales, * determine an acceptable locale for the user. * * This is a thin wrapper over C++ class LocaleMatcher. * * @param result - buffer to accept the result locale * @param resultAvailable the size of the result buffer. * @param outResult - An out parameter that contains the fallback status * @param httpAcceptLanguage - "Accept-Language:" header as per HTTP. * @param availableLocales - list of available locales to match * @param status ICU error code. Its input value must pass the U_SUCCESS() test, * or else the function returns immediately. Check for U_FAILURE() * on output or use with function chaining. (See User Guide for details.) * @return length needed for the locale. * @stable ICU 3.2 */ U_CAPI int32_t U_EXPORT2 uloc_acceptLanguageFromHTTP(char *result, int32_t resultAvailable, UAcceptResult *outResult, const char *httpAcceptLanguage, UEnumeration* availableLocales, UErrorCode *status); /** * Based on a list of available locales, * determine an acceptable locale for the user. * * This is a thin wrapper over C++ class LocaleMatcher. 
* * @param result - buffer to accept the result locale * @param resultAvailable the size of the result buffer. * @param outResult - An out parameter that contains the fallback status * @param acceptList - list of acceptable languages * @param acceptListCount - count of acceptList items * @param availableLocales - list of available locales to match * @param status ICU error code. Its input value must pass the U_SUCCESS() test, * or else the function returns immediately. Check for U_FAILURE() * on output or use with function chaining. (See User Guide for details.) * @return length needed for the locale. * @stable ICU 3.2 */ U_CAPI int32_t U_EXPORT2 uloc_acceptLanguage(char *result, int32_t resultAvailable, UAcceptResult *outResult, const char **acceptList, int32_t acceptListCount, UEnumeration* availableLocales, UErrorCode *status); /** * Gets the ICU locale ID for the specified Win32 LCID value. * * @param hostID the Win32 LCID to translate * @param locale the output buffer for the ICU locale ID, which will be NUL-terminated * if there is room. * @param localeCapacity the size of the output buffer * @param status an error is returned if the LCID is unrecognized or the output buffer * is too small * @return actual the actual size of the locale ID, not including NUL-termination * @stable ICU 3.8 */ U_CAPI int32_t U_EXPORT2 uloc_getLocaleForLCID(uint32_t hostID, char *locale, int32_t localeCapacity, UErrorCode *status); /** * Add the likely subtags for a provided locale ID, per the algorithm described * in the following CLDR technical report: * * http://www.unicode.org/reports/tr35/#Likely_Subtags * * If localeID is already in the maximal form, or there is no data available * for maximization, it will be copied to the output buffer. For example, * "und-Zzzz" cannot be maximized, since there is no reasonable maximization. 
* * Examples: * * "en" maximizes to "en_Latn_US" * * "de" maximizes to "de_Latn_DE" * * "sr" maximizes to "sr_Cyrl_RS" * * "sh" maximizes to "sr_Latn_RS" (Note this will not reverse.) * * "zh_Hani" maximizes to "zh_Hans_CN" (Note this will not reverse.) * * @param localeID The locale to maximize * @param maximizedLocaleID The maximized locale * @param maximizedLocaleIDCapacity The capacity of the maximizedLocaleID buffer * @param err Error information if maximizing the locale failed. If the length * of the localeID and the null-terminator is greater than the maximum allowed size, * or the localeID is not well-formed, the error code is U_ILLEGAL_ARGUMENT_ERROR. * @return The actual buffer size needed for the maximized locale. If it's * greater than maximizedLocaleIDCapacity, the returned ID will be truncated. * On error, the return value is -1. * @stable ICU 4.0 */ U_CAPI int32_t U_EXPORT2 uloc_addLikelySubtags(const char* localeID, char* maximizedLocaleID, int32_t maximizedLocaleIDCapacity, UErrorCode* err); /** * Minimize the subtags for a provided locale ID, per the algorithm described * in the following CLDR technical report: * * http://www.unicode.org/reports/tr35/#Likely_Subtags * * If localeID is already in the minimal form, or there is no data available * for minimization, it will be copied to the output buffer. Since the * minimization algorithm relies on proper maximization, see the comments * for uloc_addLikelySubtags for reasons why there might not be any data. * * Examples: * * "en_Latn_US" minimizes to "en" * * "de_Latn_DE" minimizes to "de" * * "sr_Cyrl_RS" minimizes to "sr" * * "zh_Hant_TW" minimizes to "zh_TW" (The region is preferred to the * script, and minimizing to "zh" would imply "zh_Hans_CN".) * * @param localeID The locale to minimize * @param minimizedLocaleID The minimized locale * @param minimizedLocaleIDCapacity The capacity of the minimizedLocaleID buffer * @param err Error information if minimizing the locale failed.
If the length * of the localeID and the null-terminator is greater than the maximum allowed size, * or the localeId is not well-formed, the error code is U_ILLEGAL_ARGUMENT_ERROR. * @return The actual buffer size needed for the minimized locale. If it's * greater than minimizedLocaleIDCapacity, the returned ID will be truncated. * On error, the return value is -1. * @stable ICU 4.0 */ U_CAPI int32_t U_EXPORT2 uloc_minimizeSubtags(const char* localeID, char* minimizedLocaleID, int32_t minimizedLocaleIDCapacity, UErrorCode* err); /** * Returns a locale ID for the specified BCP47 language tag string. * If the specified language tag contains any ill-formed subtags, * the first such subtag and all following subtags are ignored. *

* This implements the 'Language-Tag' production of BCP 47, and so * supports legacy language tags (marked as “Type: grandfathered” in BCP 47) * (regular and irregular) as well as private use language tags. * * Private use tags are represented as 'x-whatever', * and legacy tags are converted to their canonical replacements where they exist. * * Note that a few legacy tags have no modern replacement; * these will be converted using the fallback described in * the first paragraph, so some information might be lost. * * @param langtag the input BCP47 language tag. * @param localeID the output buffer receiving a locale ID for the * specified BCP47 language tag. * @param localeIDCapacity the size of the locale ID output buffer. * @param parsedLength if not NULL, successfully parsed length * for the input language tag is set. * @param err error information if receiving the locale ID * failed. * @return the length of the locale ID. * @stable ICU 4.2 */ U_CAPI int32_t U_EXPORT2 uloc_forLanguageTag(const char* langtag, char* localeID, int32_t localeIDCapacity, int32_t* parsedLength, UErrorCode* err); /** * Returns a well-formed language tag for this locale ID. *

* Note: When strict is false, any locale * fields which do not satisfy the BCP47 syntax requirement will * be omitted from the result. When strict is * true, this function sets U_ILLEGAL_ARGUMENT_ERROR to the * err if any locale fields do not satisfy the * BCP47 syntax requirement. * @param localeID the input locale ID * @param langtag the output buffer receiving BCP47 language * tag for the locale ID. * @param langtagCapacity the size of the BCP47 language tag * output buffer. * @param strict boolean value indicating if the function returns * an error for an ill-formed input locale ID. * @param err error information if receiving the language * tag failed. * @return The length of the BCP47 language tag. * @stable ICU 4.2 */ U_CAPI int32_t U_EXPORT2 uloc_toLanguageTag(const char* localeID, char* langtag, int32_t langtagCapacity, UBool strict, UErrorCode* err); /** * Converts the specified keyword (legacy key, or BCP 47 Unicode locale * extension key) to the equivalent BCP 47 Unicode locale extension key. * For example, BCP 47 Unicode locale extension key "co" is returned for * the input keyword "collation". *

* When the specified keyword is unknown, but satisfies the BCP syntax, * then the pointer to the input keyword itself will be returned. * For example, * uloc_toUnicodeLocaleKey("ZZ") returns "ZZ". * * @param keyword the input locale keyword (either legacy key * such as "collation" or BCP 47 Unicode locale extension * key such as "co"). * @return the well-formed BCP 47 Unicode locale extension key, * or NULL if the specified locale keyword cannot be * mapped to a well-formed BCP 47 Unicode locale extension * key. * @see uloc_toLegacyKey * @stable ICU 54 */ U_CAPI const char* U_EXPORT2 uloc_toUnicodeLocaleKey(const char* keyword); /** * Converts the specified keyword value (legacy type, or BCP 47 * Unicode locale extension type) to the well-formed BCP 47 Unicode locale * extension type for the specified keyword (category). For example, BCP 47 * Unicode locale extension type "phonebk" is returned for the input * keyword value "phonebook", with the keyword "collation" (or "co"). *

* When the specified keyword is not recognized, but the specified value * satisfies the syntax of the BCP 47 Unicode locale extension type, * or when the specified keyword allows 'variable' type and the specified * value satisfies the syntax, then the pointer to the input type value itself * will be returned. * For example, * uloc_toUnicodeLocaleType("Foo", "Bar") returns "Bar", * uloc_toUnicodeLocaleType("variableTop", "00A4") returns "00A4". * * @param keyword the locale keyword (either legacy key such as * "collation" or BCP 47 Unicode locale extension * key such as "co"). * @param value the locale keyword value (either legacy type * such as "phonebook" or BCP 47 Unicode locale extension * type such as "phonebk"). * @return the well-formed BCP47 Unicode locale extension type, * or NULL if the locale keyword value cannot be mapped to * a well-formed BCP 47 Unicode locale extension type. * @see uloc_toLegacyType * @stable ICU 54 */ U_CAPI const char* U_EXPORT2 uloc_toUnicodeLocaleType(const char* keyword, const char* value); /** * Converts the specified keyword (BCP 47 Unicode locale extension key, or * legacy key) to the legacy key. For example, legacy key "collation" is * returned for the input BCP 47 Unicode locale extension key "co". * * @param keyword the input locale keyword (either BCP 47 Unicode locale * extension key or legacy key). * @return the well-formed legacy key, or NULL if the specified * keyword cannot be mapped to a well-formed legacy key. * @see uloc_toUnicodeLocaleKey * @stable ICU 54 */ U_CAPI const char* U_EXPORT2 uloc_toLegacyKey(const char* keyword); /** * Converts the specified keyword value (BCP 47 Unicode locale extension type, * or legacy type or type alias) to the canonical legacy type. For example, * the legacy type "phonebook" is returned for the input BCP 47 Unicode * locale extension type "phonebk" with the keyword "collation" (or "co"). *

* When the specified keyword is not recognized, but the specified value * satisfies the syntax of legacy key, or when the specified keyword * allows 'variable' type and the specified value satisfies the syntax, * then the pointer to the input type value itself will be returned. * For example, * uloc_toLegacyType("Foo", "Bar") returns "Bar", * uloc_toLegacyType("vt", "00A4") returns "00A4". * * @param keyword the locale keyword (either legacy keyword such as * "collation" or BCP 47 Unicode locale extension * key such as "co"). * @param value the locale keyword value (either BCP 47 Unicode locale * extension type such as "phonebk" or legacy keyword value * such as "phonebook"). * @return the well-formed legacy type, or NULL if the specified * keyword value cannot be mapped to a well-formed legacy * type. * @see uloc_toUnicodeLocaleType * @stable ICU 54 */ U_CAPI const char* U_EXPORT2 uloc_toLegacyType(const char* keyword, const char* value); #endif /*_ULOC*/ // ures.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (C) 1997-2016, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * * File URES.H (formerly CRESBUND.H) * * Modification History: * * Date Name Description * 04/01/97 aliu Creation. * 02/22/99 damiba overhaul. * 04/04/99 helena Fixed internal header inclusion. * 04/15/99 Madhu Updated Javadoc * 06/14/99 stephen Removed functions taking a filename suffix. * 07/20/99 stephen Language-independent typedef to void* * 11/09/99 weiv Added ures_getLocale() * 06/24/02 weiv Added support for resource sharing ****************************************************************************** */ #ifndef URES_H #define URES_H /** * \file * \brief C API: Resource Bundle * *

C API: Resource Bundle

* * C API representing a collection of resource information pertaining to a given * locale. A resource bundle provides a way of accessing locale- specific information in * a data file. You create a resource bundle that manages the resources for a given * locale and then ask it for individual resources. *

* Resource bundles in ICU4C are currently defined using text files which conform to the following * BNF definition. * More on resource bundle concepts and syntax can be found in the * Users Guide. *

*/ /** * UResourceBundle is an opaque type for handles for resource bundles in C APIs. * @stable ICU 2.0 */ struct UResourceBundle; /** * Typedef for the opaque struct UResourceBundle. * @stable ICU 2.0 */ typedef struct UResourceBundle UResourceBundle; /** * Numeric constants for types of resource items. * @see ures_getType * @stable ICU 2.0 */ typedef enum { /** Resource type constant for "no resource". @stable ICU 2.6 */ URES_NONE=-1, /** Resource type constant for 16-bit Unicode strings. @stable ICU 2.6 */ URES_STRING=0, /** Resource type constant for binary data. @stable ICU 2.6 */ URES_BINARY=1, /** Resource type constant for tables of key-value pairs. @stable ICU 2.6 */ URES_TABLE=2, /** * Resource type constant for aliases; * internally stores a string which identifies the actual resource * storing the data (can be in a different resource bundle). * Resolved internally before delivering the actual resource through the API. * @stable ICU 2.6 */ URES_ALIAS=3, /** * Resource type constant for a single 28-bit integer, interpreted as * signed or unsigned by the ures_getInt() or ures_getUInt() function. * @see ures_getInt * @see ures_getUInt * @stable ICU 2.6 */ URES_INT=7, /** Resource type constant for arrays of resources. @stable ICU 2.6 */ URES_ARRAY=8, /** * Resource type constant for vectors of 32-bit integers. * @see ures_getIntVector * @stable ICU 2.6 */ URES_INT_VECTOR = 14, } UResType; /* * Functions to create and destroy resource bundles. */ /** * Opens a UResourceBundle, from which users can extract strings by using * their corresponding keys. * Note that the caller is responsible for calling ures_close on each successfully * opened resource bundle. * @param packageName The packageName and locale together point to an ICU udata object, * as defined by udata_open( packageName, "res", locale, err) * or equivalent. Typically, packageName will refer to a (.dat) file, or to * a package registered with udata_setAppData(). Using a full file or directory * pathname for packageName is deprecated.
If NULL, ICU data will be used. * @param locale specifies the locale for which we want to open the resource * if NULL, the default locale will be used. If strlen(locale) == 0 * root locale will be used. * * @param status fills in the outgoing error code. * The UErrorCode err parameter is used to return status information to the user. To * check whether the construction succeeded or not, you should check the value of * U_SUCCESS(err). If you wish more detailed information, you can check for * informational status results which still indicate success. U_USING_FALLBACK_WARNING * indicates that a fall back locale was used. For example, 'de_CH' was requested, * but nothing was found there, so 'de' was used. U_USING_DEFAULT_WARNING indicates that * the default locale data or root locale data was used; neither the requested locale * nor any of its fall back locales could be found. Please see the users guide for more * information on this topic. * @return a newly allocated resource bundle. * @see ures_close * @stable ICU 2.0 */ U_CAPI UResourceBundle* U_EXPORT2 ures_open(const char* packageName, const char* locale, UErrorCode* status); /** This function does not care what kind of localeID is passed in. It simply opens a bundle with * that name. Fallback mechanism is disabled for the new bundle. If the requested bundle contains * an %%ALIAS directive, the results are undefined. * @param packageName The packageName and locale together point to an ICU udata object, * as defined by udata_open( packageName, "res", locale, err) * or equivalent. Typically, packageName will refer to a (.dat) file, or to * a package registered with udata_setAppData(). Using a full file or directory * pathname for packageName is deprecated. If NULL, ICU data will be used. * @param locale specifies the locale for which we want to open the resource * if NULL, the default locale will be used. If strlen(locale) == 0 * root locale will be used. * * @param status fills in the outgoing error code. 
Either U_ZERO_ERROR or U_MISSING_RESOURCE_ERROR * @return a newly allocated resource bundle or NULL if it doesn't exist. * @see ures_close * @stable ICU 2.0 */ U_CAPI UResourceBundle* U_EXPORT2 ures_openDirect(const char* packageName, const char* locale, UErrorCode* status); /** * Same as ures_open() but takes a const UChar *path. * This path will be converted to char * using the default converter, * then ures_open() is called. * * @param packageName The packageName and locale together point to an ICU udata object, * as defined by udata_open( packageName, "res", locale, err) * or equivalent. Typically, packageName will refer to a (.dat) file, or to * a package registered with udata_setAppData(). Using a full file or directory * pathname for packageName is deprecated. If NULL, ICU data will be used. * @param locale specifies the locale for which we want to open the resource * if NULL, the default locale will be used. If strlen(locale) == 0 * root locale will be used. * @param status fills in the outgoing error code. * @return a newly allocated resource bundle. * @see ures_open * @stable ICU 2.0 */ U_CAPI UResourceBundle* U_EXPORT2 ures_openU(const UChar* packageName, const char* locale, UErrorCode* status); /** * Close a resource bundle, all pointers returned from the various ures_getXXX calls * on this particular bundle should be considered invalid henceforth. * * @param resourceBundle a pointer to a resourceBundle struct. Can be NULL. * @see ures_open * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ures_close(UResourceBundle* resourceBundle); /** * Return the version number associated with this ResourceBundle as an * UVersionInfo array. * * @param resB The resource bundle for which the version is checked. * @param versionInfo A UVersionInfo array that is filled with the version number * as specified in the resource bundle or its parent. 
* @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ures_getVersion(const UResourceBundle* resB, UVersionInfo versionInfo); /** * Return the name of the Locale associated with this ResourceBundle. * You can choose between requested, valid and real locale. * * @param resourceBundle resource bundle in question * @param type You can choose between requested, valid and actual * locale. For description see the definition of * ULocDataLocaleType in uloc.h * @param status just for catching illegal arguments * @return A Locale name * @stable ICU 2.8 */ U_CAPI const char* U_EXPORT2 ures_getLocaleByType(const UResourceBundle* resourceBundle, ULocDataLocaleType type, UErrorCode* status); /** * Returns a string from a string resource type * * @param resourceBundle a string resource * @param len fills in the length of resulting string * @param status fills in the outgoing error code * could be U_MISSING_RESOURCE_ERROR if the key is not found * Always check the value of status. Don't count on returning NULL. * could be a non-failing error * e.g.: U_USING_FALLBACK_WARNING,U_USING_DEFAULT_WARNING * @return a pointer to a zero-terminated UChar array which lives in a memory mapped/DLL file. * @see ures_getBinary * @see ures_getIntVector * @see ures_getInt * @see ures_getUInt * @stable ICU 2.0 */ U_CAPI const UChar* U_EXPORT2 ures_getString(const UResourceBundle* resourceBundle, int32_t* len, UErrorCode* status); /** * Returns a UTF-8 string from a string resource. * The UTF-8 string may be returnable directly as a pointer, or * it may need to be copied, or transformed from UTF-16 using u_strToUTF8() * or equivalent. * * If forceCopy==true, then the string is always written to the dest buffer * and dest is returned. * * If forceCopy==false, then the string is returned as a pointer if possible, * without needing a dest buffer (it can be NULL). If the string needs to be * copied or transformed, then it may be placed into dest at an arbitrary offset. 
* * If the string is to be written to dest, then U_BUFFER_OVERFLOW_ERROR and * U_STRING_NOT_TERMINATED_WARNING are set if appropriate, as usual. * * If the string is transformed from UTF-16, then a conversion error may occur * if an unpaired surrogate is encountered. If the function is successful, then * the output UTF-8 string is always well-formed. * * @param resB Resource bundle. * @param dest Destination buffer. Can be NULL only if capacity=*length==0. * @param length Input: Capacity of destination buffer. * Output: Actual length of the UTF-8 string, not counting the * terminating NUL, even in case of U_BUFFER_OVERFLOW_ERROR. * Can be NULL, meaning capacity=0 and the string length is not * returned to the caller. * @param forceCopy If true, then the output string will always be written to * dest, with U_BUFFER_OVERFLOW_ERROR and * U_STRING_NOT_TERMINATED_WARNING set if appropriate. * If false, then the dest buffer may or may not contain a * copy of the string. dest may or may not be modified. * If a copy needs to be written, then the UErrorCode parameter * indicates overflow etc. as usual. * @param status Pointer to a standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return The pointer to the UTF-8 string. It may be dest, or at some offset * from dest (only if !forceCopy), or in unrelated memory. * Always NUL-terminated unless the string was written to dest and * length==capacity (in which case U_STRING_NOT_TERMINATED_WARNING is set). * * @see ures_getString * @see u_strToUTF8 * @stable ICU 3.6 */ U_CAPI const char * U_EXPORT2 ures_getUTF8String(const UResourceBundle *resB, char *dest, int32_t *length, UBool forceCopy, UErrorCode *status); /** * Returns a binary data from a binary resource. 
* * @param resourceBundle a binary resource * @param len fills in the length of resulting byte chunk * @param status fills in the outgoing error code * could be U_MISSING_RESOURCE_ERROR if the key is not found * Always check the value of status. Don't count on returning NULL. * could be a non-failing error * e.g.: U_USING_FALLBACK_WARNING,U_USING_DEFAULT_WARNING * @return a pointer to a chunk of unsigned bytes which live in a memory mapped/DLL file. * @see ures_getString * @see ures_getIntVector * @see ures_getInt * @see ures_getUInt * @stable ICU 2.0 */ U_CAPI const uint8_t* U_EXPORT2 ures_getBinary(const UResourceBundle* resourceBundle, int32_t* len, UErrorCode* status); /** * Returns a 32 bit integer array from a resource. * * @param resourceBundle an int vector resource * @param len fills in the length of the resulting integer vector * @param status fills in the outgoing error code * could be U_MISSING_RESOURCE_ERROR if the key is not found * Always check the value of status. Don't count on returning NULL. * could be a non-failing error * e.g.: U_USING_FALLBACK_WARNING,U_USING_DEFAULT_WARNING * @return a pointer to a chunk of integers which live in a memory mapped/DLL file. * @see ures_getBinary * @see ures_getString * @see ures_getInt * @see ures_getUInt * @stable ICU 2.0 */ U_CAPI const int32_t* U_EXPORT2 ures_getIntVector(const UResourceBundle* resourceBundle, int32_t* len, UErrorCode* status); /** * Returns an unsigned integer from a resource. * This integer is originally 28 bits.
* * @param resourceBundle an integer resource * @param status fills in the outgoing error code * could be U_MISSING_RESOURCE_ERROR if the key is not found * could be a non-failing error * e.g.: U_USING_FALLBACK_WARNING,U_USING_DEFAULT_WARNING * @return an unsigned integer value * @see ures_getInt * @see ures_getIntVector * @see ures_getBinary * @see ures_getString * @stable ICU 2.0 */ U_CAPI uint32_t U_EXPORT2 ures_getUInt(const UResourceBundle* resourceBundle, UErrorCode *status); /** * Returns a signed integer from a resource. * This integer is originally 28 bits and the sign gets propagated. * * @param resourceBundle an integer resource * @param status fills in the outgoing error code * could be U_MISSING_RESOURCE_ERROR if the key is not found * could be a non-failing error * e.g.: U_USING_FALLBACK_WARNING,U_USING_DEFAULT_WARNING * @return a signed integer value * @see ures_getUInt * @see ures_getIntVector * @see ures_getBinary * @see ures_getString * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 ures_getInt(const UResourceBundle* resourceBundle, UErrorCode *status); /** * Returns the size of a resource. Size for scalar types is always 1, * and for vector/table types is the number of child resources. * @warning Integer array is treated as a scalar type. There are no * APIs to access individual members of an integer array. It * is always returned as a whole. * @param resourceBundle a resource * @return number of resources in a given resource. * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 ures_getSize(const UResourceBundle *resourceBundle); /** * Returns the type of a resource. Available types are defined in enum UResType * * @param resourceBundle a resource * @return type of the given resource. * @see UResType * @stable ICU 2.0 */ U_CAPI UResType U_EXPORT2 ures_getType(const UResourceBundle *resourceBundle); /** * Returns the key associated with a given resource. Not all the resources have a key - only * those that are members of a table.
* * @param resourceBundle a resource * @return a key associated with this resource, or NULL if it doesn't have a key * @stable ICU 2.0 */ U_CAPI const char * U_EXPORT2 ures_getKey(const UResourceBundle *resourceBundle); /* ITERATION API This API provides means for iterating through a resource */ /** * Resets the internal context of a resource so that iteration starts from the first element. * * @param resourceBundle a resource * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ures_resetIterator(UResourceBundle *resourceBundle); /** * Checks whether the given resource has another element to iterate over. * * @param resourceBundle a resource * @return true if there are more elements, false if there are no more elements * @stable ICU 2.0 */ U_CAPI UBool U_EXPORT2 ures_hasNext(const UResourceBundle *resourceBundle); /** * Returns the next resource in a given resource or NULL if there are no more resources * to iterate over. Features a fill-in parameter. * * @param resourceBundle a resource * @param fillIn if NULL a new UResourceBundle struct is allocated and must be closed by the caller. * Alternatively, you can supply a struct to be filled by this function. * @param status fills in the outgoing error code. You may still get a non NULL result even if an * error occurred. Check status instead. * @return a pointer to a UResourceBundle struct. If fill in param was NULL, caller must close it * @stable ICU 2.0 */ U_CAPI UResourceBundle* U_EXPORT2 ures_getNextResource(UResourceBundle *resourceBundle, UResourceBundle *fillIn, UErrorCode *status); /** * Returns the next string in a given resource or NULL if there are no more resources * to iterate over. * * @param resourceBundle a resource * @param len fill in length of the string * @param key fill in for key associated with this string. NULL if no key * @param status fills in the outgoing error code. If an error occurred, we may return NULL, but don't * count on it. Check status instead!
* @return a pointer to a zero-terminated UChar array which lives in a memory mapped/DLL file. * @stable ICU 2.0 */ U_CAPI const UChar* U_EXPORT2 ures_getNextString(UResourceBundle *resourceBundle, int32_t* len, const char ** key, UErrorCode *status); /** * Returns the resource in a given resource at the specified index. Features a fill-in parameter. * * @param resourceBundle the resource bundle from which to get a sub-resource * @param indexR an index to the wanted resource. * @param fillIn if NULL a new UResourceBundle struct is allocated and must be closed by the caller. * Alternatively, you can supply a struct to be filled by this function. * @param status fills in the outgoing error code. Don't count on NULL being returned if an error has * occurred. Check status instead. * @return a pointer to a UResourceBundle struct. If fill in param was NULL, caller must close it * @stable ICU 2.0 */ U_CAPI UResourceBundle* U_EXPORT2 ures_getByIndex(const UResourceBundle *resourceBundle, int32_t indexR, UResourceBundle *fillIn, UErrorCode *status); /** * Returns the string in a given resource at the specified index. * * @param resourceBundle a resource * @param indexS an index to the wanted string. * @param len fill in length of the string * @param status fills in the outgoing error code. If an error occurred, we may return NULL, but don't * count on it. Check status instead! * @return a pointer to a zero-terminated UChar array which lives in a memory mapped/DLL file. * @stable ICU 2.0 */ U_CAPI const UChar* U_EXPORT2 ures_getStringByIndex(const UResourceBundle *resourceBundle, int32_t indexS, int32_t* len, UErrorCode *status); /** * Returns a UTF-8 string from a resource at the specified index. * The UTF-8 string may be returnable directly as a pointer, or * it may need to be copied, or transformed from UTF-16 using u_strToUTF8() * or equivalent. * * If forceCopy==true, then the string is always written to the dest buffer * and dest is returned. 
* * If forceCopy==false, then the string is returned as a pointer if possible, * without needing a dest buffer (it can be NULL). If the string needs to be * copied or transformed, then it may be placed into dest at an arbitrary offset. * * If the string is to be written to dest, then U_BUFFER_OVERFLOW_ERROR and * U_STRING_NOT_TERMINATED_WARNING are set if appropriate, as usual. * * If the string is transformed from UTF-16, then a conversion error may occur * if an unpaired surrogate is encountered. If the function is successful, then * the output UTF-8 string is always well-formed. * * @param resB Resource bundle. * @param stringIndex An index to the wanted string. * @param dest Destination buffer. Can be NULL only if capacity=*pLength==0. * @param pLength Input: Capacity of destination buffer. * Output: Actual length of the UTF-8 string, not counting the * terminating NUL, even in case of U_BUFFER_OVERFLOW_ERROR. * Can be NULL, meaning capacity=0 and the string length is not * returned to the caller. * @param forceCopy If true, then the output string will always be written to * dest, with U_BUFFER_OVERFLOW_ERROR and * U_STRING_NOT_TERMINATED_WARNING set if appropriate. * If false, then the dest buffer may or may not contain a * copy of the string. dest may or may not be modified. * If a copy needs to be written, then the UErrorCode parameter * indicates overflow etc. as usual. * @param status Pointer to a standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return The pointer to the UTF-8 string. It may be dest, or at some offset * from dest (only if !forceCopy), or in unrelated memory. * Always NUL-terminated unless the string was written to dest and * length==capacity (in which case U_STRING_NOT_TERMINATED_WARNING is set).
* * @see ures_getStringByIndex * @see u_strToUTF8 * @stable ICU 3.6 */ U_CAPI const char * U_EXPORT2 ures_getUTF8StringByIndex(const UResourceBundle *resB, int32_t stringIndex, char *dest, int32_t *pLength, UBool forceCopy, UErrorCode *status); /** * Returns a resource in a given resource that has a given key. This procedure works only with table * resources. Features a fill-in parameter. * * @param resourceBundle a resource * @param key a key associated with the wanted resource * @param fillIn if NULL a new UResourceBundle struct is allocated and must be closed by the caller. * Alternatively, you can supply a struct to be filled by this function. * @param status fills in the outgoing error code. * @return a pointer to a UResourceBundle struct. If fill in param was NULL, caller must close it * @stable ICU 2.0 */ U_CAPI UResourceBundle* U_EXPORT2 ures_getByKey(const UResourceBundle *resourceBundle, const char* key, UResourceBundle *fillIn, UErrorCode *status); /** * Returns a string in a given resource that has a given key. This procedure works only with table * resources. * * @param resB a resource * @param key a key associated with the wanted string * @param len fill in length of the string * @param status fills in the outgoing error code. If an error occurred, we may return NULL, but don't * count on it. Check status instead! * @return a pointer to a zero-terminated UChar array which lives in a memory mapped/DLL file. * @stable ICU 2.0 */ U_CAPI const UChar* U_EXPORT2 ures_getStringByKey(const UResourceBundle *resB, const char* key, int32_t* len, UErrorCode *status); /** * Returns a UTF-8 string from a resource and a key. * This function works only with table resources. * * The UTF-8 string may be returnable directly as a pointer, or * it may need to be copied, or transformed from UTF-16 using u_strToUTF8() * or equivalent. * * If forceCopy==true, then the string is always written to the dest buffer * and dest is returned. 
* * If forceCopy==false, then the string is returned as a pointer if possible, * without needing a dest buffer (it can be NULL). If the string needs to be * copied or transformed, then it may be placed into dest at an arbitrary offset. * * If the string is to be written to dest, then U_BUFFER_OVERFLOW_ERROR and * U_STRING_NOT_TERMINATED_WARNING are set if appropriate, as usual. * * If the string is transformed from UTF-16, then a conversion error may occur * if an unpaired surrogate is encountered. If the function is successful, then * the output UTF-8 string is always well-formed. * * @param resB Resource bundle. * @param key A key associated with the wanted resource * @param dest Destination buffer. Can be NULL only if capacity=*pLength==0. * @param pLength Input: Capacity of destination buffer. * Output: Actual length of the UTF-8 string, not counting the * terminating NUL, even in case of U_BUFFER_OVERFLOW_ERROR. * Can be NULL, meaning capacity=0 and the string length is not * returned to the caller. * @param forceCopy If true, then the output string will always be written to * dest, with U_BUFFER_OVERFLOW_ERROR and * U_STRING_NOT_TERMINATED_WARNING set if appropriate. * If false, then the dest buffer may or may not contain a * copy of the string. dest may or may not be modified. * If a copy needs to be written, then the UErrorCode parameter * indicates overflow etc. as usual. * @param status Pointer to a standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return The pointer to the UTF-8 string. It may be dest, or at some offset * from dest (only if !forceCopy), or in unrelated memory. * Always NUL-terminated unless the string was written to dest and * length==capacity (in which case U_STRING_NOT_TERMINATED_WARNING is set).
* * @see ures_getStringByKey * @see u_strToUTF8 * @stable ICU 3.6 */ U_CAPI const char * U_EXPORT2 ures_getUTF8StringByKey(const UResourceBundle *resB, const char *key, char *dest, int32_t *pLength, UBool forceCopy, UErrorCode *status); /** * Create a string enumerator, owned by the caller, of all locales located within * the specified resource tree. * @param packageName name of the tree, such as (NULL) or U_ICUDATA_ALIAS or "ICUDATA-coll" * This call is similar to uloc_getAvailable(). * @param status error code * @stable ICU 3.2 */ U_CAPI UEnumeration* U_EXPORT2 ures_openAvailableLocales(const char *packageName, UErrorCode *status); #endif /*_URES*/ /*eof*/ // udisplaycontext.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ***************************************************************************************** * Copyright (C) 2014-2016, International Business Machines * Corporation and others. All Rights Reserved. ***************************************************************************************** */ #ifndef UDISPLAYCONTEXT_H #define UDISPLAYCONTEXT_H #if !UCONFIG_NO_FORMATTING /** * \file * \brief C API: Display context types (enum values) */ /** * Display context types, for getting values of a particular setting. * Note, the specific numeric values are internal and may change. * @stable ICU 51 */ enum UDisplayContextType { /** * Type to retrieve the dialect handling setting, e.g. * UDISPCTX_STANDARD_NAMES or UDISPCTX_DIALECT_NAMES. * @stable ICU 51 */ UDISPCTX_TYPE_DIALECT_HANDLING = 0, /** * Type to retrieve the capitalization context setting, e.g. * UDISPCTX_CAPITALIZATION_NONE, UDISPCTX_CAPITALIZATION_FOR_MIDDLE_OF_SENTENCE, * UDISPCTX_CAPITALIZATION_FOR_BEGINNING_OF_SENTENCE, etc. * @stable ICU 51 */ UDISPCTX_TYPE_CAPITALIZATION = 1, /** * Type to retrieve the display length setting, e.g. * UDISPCTX_LENGTH_FULL, UDISPCTX_LENGTH_SHORT.
* @stable ICU 54 */ UDISPCTX_TYPE_DISPLAY_LENGTH = 2, /** * Type to retrieve the substitute handling setting, e.g. * UDISPCTX_SUBSTITUTE, UDISPCTX_NO_SUBSTITUTE. * @stable ICU 58 */ UDISPCTX_TYPE_SUBSTITUTE_HANDLING = 3 }; /** * @stable ICU 51 */ typedef enum UDisplayContextType UDisplayContextType; /** * Display context settings. * Note, the specific numeric values are internal and may change. * @stable ICU 51 */ enum UDisplayContext { /** * ================================ * DIALECT_HANDLING can be set to one of UDISPCTX_STANDARD_NAMES or * UDISPCTX_DIALECT_NAMES. Use UDisplayContextType UDISPCTX_TYPE_DIALECT_HANDLING * to get the value. */ /** * A possible setting for DIALECT_HANDLING: * use standard names when generating a locale name, * e.g. en_GB displays as 'English (United Kingdom)'. * @stable ICU 51 */ UDISPCTX_STANDARD_NAMES = (UDISPCTX_TYPE_DIALECT_HANDLING<<8) + 0, /** * A possible setting for DIALECT_HANDLING: * use dialect names, when generating a locale name, * e.g. en_GB displays as 'British English'. * @stable ICU 51 */ UDISPCTX_DIALECT_NAMES = (UDISPCTX_TYPE_DIALECT_HANDLING<<8) + 1, /** * ================================ * CAPITALIZATION can be set to one of UDISPCTX_CAPITALIZATION_NONE, * UDISPCTX_CAPITALIZATION_FOR_MIDDLE_OF_SENTENCE, * UDISPCTX_CAPITALIZATION_FOR_BEGINNING_OF_SENTENCE, * UDISPCTX_CAPITALIZATION_FOR_UI_LIST_OR_MENU, or * UDISPCTX_CAPITALIZATION_FOR_STANDALONE. * Use UDisplayContextType UDISPCTX_TYPE_CAPITALIZATION to get the value. */ /** * The capitalization context to be used is unknown (this is the default value). * @stable ICU 51 */ UDISPCTX_CAPITALIZATION_NONE = (UDISPCTX_TYPE_CAPITALIZATION<<8) + 0, /** * The capitalization context if a date, date symbol or display name is to be * formatted with capitalization appropriate for the middle of a sentence. 
* @stable ICU 51 */ UDISPCTX_CAPITALIZATION_FOR_MIDDLE_OF_SENTENCE = (UDISPCTX_TYPE_CAPITALIZATION<<8) + 1, /** * The capitalization context if a date, date symbol or display name is to be * formatted with capitalization appropriate for the beginning of a sentence. * @stable ICU 51 */ UDISPCTX_CAPITALIZATION_FOR_BEGINNING_OF_SENTENCE = (UDISPCTX_TYPE_CAPITALIZATION<<8) + 2, /** * The capitalization context if a date, date symbol or display name is to be * formatted with capitalization appropriate for a user-interface list or menu item. * @stable ICU 51 */ UDISPCTX_CAPITALIZATION_FOR_UI_LIST_OR_MENU = (UDISPCTX_TYPE_CAPITALIZATION<<8) + 3, /** * The capitalization context if a date, date symbol or display name is to be * formatted with capitalization appropriate for stand-alone usage such as an * isolated name on a calendar page. * @stable ICU 51 */ UDISPCTX_CAPITALIZATION_FOR_STANDALONE = (UDISPCTX_TYPE_CAPITALIZATION<<8) + 4, /** * ================================ * DISPLAY_LENGTH can be set to one of UDISPCTX_LENGTH_FULL or * UDISPCTX_LENGTH_SHORT. Use UDisplayContextType UDISPCTX_TYPE_DISPLAY_LENGTH * to get the value. */ /** * A possible setting for DISPLAY_LENGTH: * use full names when generating a locale name, * e.g. "United States" for US. * @stable ICU 54 */ UDISPCTX_LENGTH_FULL = (UDISPCTX_TYPE_DISPLAY_LENGTH<<8) + 0, /** * A possible setting for DISPLAY_LENGTH: * use short names when generating a locale name, * e.g. "U.S." for US. * @stable ICU 54 */ UDISPCTX_LENGTH_SHORT = (UDISPCTX_TYPE_DISPLAY_LENGTH<<8) + 1, /** * ================================ * SUBSTITUTE_HANDLING can be set to one of UDISPCTX_SUBSTITUTE or * UDISPCTX_NO_SUBSTITUTE. Use UDisplayContextType UDISPCTX_TYPE_SUBSTITUTE_HANDLING * to get the value. */ /** * A possible setting for SUBSTITUTE_HANDLING: * Returns a fallback value (e.g., the input code) when no data is available. * This is the default value. 
* @stable ICU 58 */ UDISPCTX_SUBSTITUTE = (UDISPCTX_TYPE_SUBSTITUTE_HANDLING<<8) + 0, /** * A possible setting for SUBSTITUTE_HANDLING: * Returns a null value with error code set to U_ILLEGAL_ARGUMENT_ERROR when no * data is available. * @stable ICU 58 */ UDISPCTX_NO_SUBSTITUTE = (UDISPCTX_TYPE_SUBSTITUTE_HANDLING<<8) + 1 }; /** * C typedef for enum UDisplayContext. * @stable ICU 51 */ typedef enum UDisplayContext UDisplayContext; #endif /* #if !UCONFIG_NO_FORMATTING */ #endif // uldnames.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2010-2016, International Business Machines Corporation and * others. All Rights Reserved. ******************************************************************************* */ #ifndef __ULDNAMES_H__ #define __ULDNAMES_H__ /** * \file * \brief C API: Provides display names of Locale ids and their components. */ /** * Enum used in LocaleDisplayNames::createInstance. * @stable ICU 4.4 */ typedef enum { /** * Use standard names when generating a locale name, * e.g. en_GB displays as 'English (United Kingdom)'. * @stable ICU 4.4 */ ULDN_STANDARD_NAMES = 0, /** * Use dialect names, when generating a locale name, * e.g. en_GB displays as 'British English'. * @stable ICU 4.4 */ ULDN_DIALECT_NAMES } UDialectHandling; /** * Opaque C service object type for the locale display names API * @stable ICU 4.4 */ struct ULocaleDisplayNames; /** * C typedef for struct ULocaleDisplayNames. * @stable ICU 4.4 */ typedef struct ULocaleDisplayNames ULocaleDisplayNames; #if !UCONFIG_NO_FORMATTING /** * Returns an instance of LocaleDisplayNames that returns names * formatted for the provided locale, using the provided * dialectHandling. The usual value for dialectHandling is * ULDN_STANDARD_NAMES.
* * @param locale the display locale * @param dialectHandling how to select names for locales * @return a ULocaleDisplayNames instance * @param pErrorCode the status code * @stable ICU 4.4 */ U_CAPI ULocaleDisplayNames * U_EXPORT2 uldn_open(const char * locale, UDialectHandling dialectHandling, UErrorCode *pErrorCode); /** * Closes a ULocaleDisplayNames instance obtained from uldn_open(). * @param ldn the ULocaleDisplayNames instance to be closed * @stable ICU 4.4 */ U_CAPI void U_EXPORT2 uldn_close(ULocaleDisplayNames *ldn); /* getters for state */ /** * Returns the locale used to determine the display names. This is * not necessarily the same locale passed to {@link #uldn_open}. * @param ldn the LocaleDisplayNames instance * @return the display locale * @stable ICU 4.4 */ U_CAPI const char * U_EXPORT2 uldn_getLocale(const ULocaleDisplayNames *ldn); /** * Returns the dialect handling used in the display names. * @param ldn the LocaleDisplayNames instance * @return the dialect handling enum * @stable ICU 4.4 */ U_CAPI UDialectHandling U_EXPORT2 uldn_getDialectHandling(const ULocaleDisplayNames *ldn); /* names for entire locales */ /** * Returns the display name of the provided locale. * @param ldn the LocaleDisplayNames instance * @param locale the locale whose display name to return * @param result receives the display name * @param maxResultSize the size of the result buffer * @param pErrorCode the status code * @return the actual buffer size needed for the display name. If it's * greater than maxResultSize, the returned name will be truncated. * @stable ICU 4.4 */ U_CAPI int32_t U_EXPORT2 uldn_localeDisplayName(const ULocaleDisplayNames *ldn, const char *locale, UChar *result, int32_t maxResultSize, UErrorCode *pErrorCode); /* names for components of a locale */ /** * Returns the display name of the provided language code. 
* @param ldn the LocaleDisplayNames instance * @param lang the language code whose display name to return * @param result receives the display name * @param maxResultSize the size of the result buffer * @param pErrorCode the status code * @return the actual buffer size needed for the display name. If it's * greater than maxResultSize, the returned name will be truncated. * @stable ICU 4.4 */ U_CAPI int32_t U_EXPORT2 uldn_languageDisplayName(const ULocaleDisplayNames *ldn, const char *lang, UChar *result, int32_t maxResultSize, UErrorCode *pErrorCode); /** * Returns the display name of the provided script. * @param ldn the LocaleDisplayNames instance * @param script the script whose display name to return * @param result receives the display name * @param maxResultSize the size of the result buffer * @param pErrorCode the status code * @return the actual buffer size needed for the display name. If it's * greater than maxResultSize, the returned name will be truncated. * @stable ICU 4.4 */ U_CAPI int32_t U_EXPORT2 uldn_scriptDisplayName(const ULocaleDisplayNames *ldn, const char *script, UChar *result, int32_t maxResultSize, UErrorCode *pErrorCode); /** * Returns the display name of the provided script code. * @param ldn the LocaleDisplayNames instance * @param scriptCode the script code whose display name to return * @param result receives the display name * @param maxResultSize the size of the result buffer * @param pErrorCode the status code * @return the actual buffer size needed for the display name. If it's * greater than maxResultSize, the returned name will be truncated. * @stable ICU 4.4 */ U_CAPI int32_t U_EXPORT2 uldn_scriptCodeDisplayName(const ULocaleDisplayNames *ldn, UScriptCode scriptCode, UChar *result, int32_t maxResultSize, UErrorCode *pErrorCode); /** * Returns the display name of the provided region code. 
* @param ldn the LocaleDisplayNames instance * @param region the region code whose display name to return * @param result receives the display name * @param maxResultSize the size of the result buffer * @param pErrorCode the status code * @return the actual buffer size needed for the display name. If it's * greater than maxResultSize, the returned name will be truncated. * @stable ICU 4.4 */ U_CAPI int32_t U_EXPORT2 uldn_regionDisplayName(const ULocaleDisplayNames *ldn, const char *region, UChar *result, int32_t maxResultSize, UErrorCode *pErrorCode); /** * Returns the display name of the provided variant * @param ldn the LocaleDisplayNames instance * @param variant the variant whose display name to return * @param result receives the display name * @param maxResultSize the size of the result buffer * @param pErrorCode the status code * @return the actual buffer size needed for the display name. If it's * greater than maxResultSize, the returned name will be truncated. * @stable ICU 4.4 */ U_CAPI int32_t U_EXPORT2 uldn_variantDisplayName(const ULocaleDisplayNames *ldn, const char *variant, UChar *result, int32_t maxResultSize, UErrorCode *pErrorCode); /** * Returns the display name of the provided locale key * @param ldn the LocaleDisplayNames instance * @param key the locale key whose display name to return * @param result receives the display name * @param maxResultSize the size of the result buffer * @param pErrorCode the status code * @return the actual buffer size needed for the display name. If it's * greater than maxResultSize, the returned name will be truncated. * @stable ICU 4.4 */ U_CAPI int32_t U_EXPORT2 uldn_keyDisplayName(const ULocaleDisplayNames *ldn, const char *key, UChar *result, int32_t maxResultSize, UErrorCode *pErrorCode); /** * Returns the display name of the provided value (used with the provided key). 
* @param ldn the LocaleDisplayNames instance * @param key the locale key * @param value the locale key's value * @param result receives the display name * @param maxResultSize the size of the result buffer * @param pErrorCode the status code * @return the actual buffer size needed for the display name. If it's * greater than maxResultSize, the returned name will be truncated. * @stable ICU 4.4 */ U_CAPI int32_t U_EXPORT2 uldn_keyValueDisplayName(const ULocaleDisplayNames *ldn, const char *key, const char *value, UChar *result, int32_t maxResultSize, UErrorCode *pErrorCode); /** * Returns an instance of LocaleDisplayNames that returns names formatted * for the provided locale, using the provided UDisplayContext settings. * * @param locale The display locale * @param contexts List of one or more context settings (e.g. for dialect * handling, capitalization, etc.) * @param length Number of items in the contexts list * @param pErrorCode Pointer to UErrorCode input/output status. If at entry this indicates * a failure status, the function will do nothing; otherwise this will be * updated with any new status from the function. * @return a ULocaleDisplayNames instance * @stable ICU 51 */ U_CAPI ULocaleDisplayNames * U_EXPORT2 uldn_openForContext(const char * locale, UDisplayContext *contexts, int32_t length, UErrorCode *pErrorCode); /** * Returns the UDisplayContext value for the specified UDisplayContextType. * @param ldn the ULocaleDisplayNames instance * @param type the UDisplayContextType whose value to return * @param pErrorCode Pointer to UErrorCode input/output status. If at entry this indicates * a failure status, the function will do nothing; otherwise this will be * updated with any new status from the function. * @return the UDisplayContext value for the specified type.
* @stable ICU 51 */ U_CAPI UDisplayContext U_EXPORT2 uldn_getContext(const ULocaleDisplayNames *ldn, UDisplayContextType type, UErrorCode *pErrorCode); #endif /* !UCONFIG_NO_FORMATTING */ #endif /* __ULDNAMES_H__ */ // ucurr.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (c) 2002-2016, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ #ifndef _UCURR_H_ #define _UCURR_H_ /** * \file * \brief C API: Encapsulates information about a currency. * * The ucurr API encapsulates information about a currency, as defined by * ISO 4217. A currency is represented by a 3-character string * containing its ISO 4217 code. This API can return various data * necessary for the proper display of a currency: * *

  • A display symbol, for a specific locale *
  • The number of fraction digits to display *
  • A rounding increment *
* * The DecimalFormat class uses these data to display * currencies. * @author Alan Liu * @since ICU 2.2 */ #if !UCONFIG_NO_FORMATTING /** * Currency Usage used for Decimal Format * @stable ICU 54 */ enum UCurrencyUsage { /** * a setting to specify currency usage which determines currency digit * and rounding for standard usage, for example: "50.00 NT$" * used as DEFAULT value * @stable ICU 54 */ UCURR_USAGE_STANDARD=0, /** * a setting to specify currency usage which determines currency digit * and rounding for cash usage, for example: "50 NT$" * @stable ICU 54 */ UCURR_USAGE_CASH=1, }; /** Currency Usage used for Decimal Format */ typedef enum UCurrencyUsage UCurrencyUsage; /** * Finds a currency code for the given locale. * @param locale the locale for which to retrieve a currency code. * Currency can be specified by the "currency" keyword * in which case it overrides the default currency code * @param buff fill in buffer. Can be NULL for preflighting. * @param buffCapacity capacity of the fill in buffer. Can be 0 for * preflighting. If it is non-zero, the buff parameter * must not be NULL. * @param ec error code * @return length of the currency string. It should always be 3. If 0, * currency couldn't be found or the input values are * invalid. * @stable ICU 2.8 */ U_CAPI int32_t U_EXPORT2 ucurr_forLocale(const char* locale, UChar* buff, int32_t buffCapacity, UErrorCode* ec); /** * Selector constants for ucurr_getName(). * * @see ucurr_getName * @stable ICU 2.6 */ typedef enum UCurrNameStyle { /** * Selector for ucurr_getName indicating a symbolic name for a * currency, such as "$" for USD. * @stable ICU 2.6 */ UCURR_SYMBOL_NAME, /** * Selector for ucurr_getName indicating the long name for a * currency, such as "US Dollar" for USD. * @stable ICU 2.6 */ UCURR_LONG_NAME, #if (NTDDI_VERSION >= NTDDI_WIN10_VB) /** * Selector for getName() indicating the narrow currency symbol. 
* The narrow currency symbol is similar to the regular currency * symbol, but it always takes the shortest form: for example, * "$" instead of "US$" for USD in en-CA. * * @stable ICU 61 */ UCURR_NARROW_SYMBOL_NAME, #endif // (NTDDI_VERSION >= NTDDI_WIN10_VB) #if (NTDDI_VERSION >= NTDDI_WIN11_ZN) /** * Selector for getName() indicating the variant currency symbol. * The variant symbol for a currency is an alternative symbol * that is not necessarily as widely used as the regular symbol. * * @stable ICU 68 */ UCURR_VARIANT_SYMBOL_NAME #endif // (NTDDI_VERSION >= NTDDI_WIN11_ZN) } UCurrNameStyle; #if !UCONFIG_NO_SERVICE /** * Registry key type returned by ucurr_register() and accepted by ucurr_unregister(). * @stable ICU 2.6 */ typedef const void* UCurrRegistryKey; /** * Register an (existing) ISO 4217 currency code for the given locale. * Only the country code and the two variants EURO and PRE_EURO are * recognized. * @param isoCode the three-letter ISO 4217 currency code * @param locale the locale for which to register this currency code * @param status the in/out status code * @return a registry key that can be used to unregister this currency code, or NULL * if there was an error. * @stable ICU 2.6 */ U_CAPI UCurrRegistryKey U_EXPORT2 ucurr_register(const UChar* isoCode, const char* locale, UErrorCode* status); /** * Unregister the previously-registered currency definitions using the * UCurrRegistryKey returned from ucurr_register. Key becomes invalid after * a successful call and should not be used again. Any currency * that might have been hidden by the original ucurr_register call is * restored. * @param key the registry key returned by a previous call to ucurr_register * @param status the in/out status code, no special meanings are assigned * @return true if the currency for this key was successfully unregistered * @stable ICU 2.6 */ U_CAPI UBool U_EXPORT2 ucurr_unregister(UCurrRegistryKey key, UErrorCode* status); #endif /* UCONFIG_NO_SERVICE */ /** * Returns the display name for the given currency in the * given locale.
For example, the display name for the USD * currency object in the en_US locale is "$". * @param currency null-terminated 3-letter ISO 4217 code * @param locale locale in which to display currency * @param nameStyle selector for which kind of name to return * @param isChoiceFormat always set to false, or can be NULL; * display names are static strings; * since ICU 4.4, ChoiceFormat patterns are no longer supported * @param len fill-in parameter to receive length of result * @param ec error code * @return pointer to display string of 'len' UChars. If the resource * data contains no entry for 'currency', then 'currency' itself is * returned. * @stable ICU 2.6 */ U_CAPI const UChar* U_EXPORT2 ucurr_getName(const UChar* currency, const char* locale, UCurrNameStyle nameStyle, UBool* isChoiceFormat, int32_t* len, UErrorCode* ec); /** * Returns the plural name for the given currency in the * given locale. For example, the plural name for the USD * currency object in the en_US locale is "US dollar" or "US dollars". * @param currency null-terminated 3-letter ISO 4217 code * @param locale locale in which to display currency * @param isChoiceFormat always set to false, or can be NULL; * display names are static strings; * since ICU 4.4, ChoiceFormat patterns are no longer supported * @param pluralCount plural count * @param len fill-in parameter to receive length of result * @param ec error code * @return pointer to display string of 'len' UChars. If the resource * data contains no entry for 'currency', then 'currency' itself is * returned. * @stable ICU 4.2 */ U_CAPI const UChar* U_EXPORT2 ucurr_getPluralName(const UChar* currency, const char* locale, UBool* isChoiceFormat, const char* pluralCount, int32_t* len, UErrorCode* ec); /** * Returns the number of fraction digits that should * be displayed for the given currency.
* This is equivalent to ucurr_getDefaultFractionDigitsForUsage(currency,UCURR_USAGE_STANDARD,ec); * * Important: The number of fraction digits for a given currency is NOT * guaranteed to be constant across versions of ICU or CLDR. For example, * do NOT use this value as a mechanism for deciding the magnitude used * to store currency values in a database. You should use this value for * display purposes only. * * @param currency null-terminated 3-letter ISO 4217 code * @param ec input-output error code * @return a non-negative number of fraction digits to be * displayed, or 0 if there is an error * @stable ICU 3.0 */ U_CAPI int32_t U_EXPORT2 ucurr_getDefaultFractionDigits(const UChar* currency, UErrorCode* ec); /** * Returns the number of fraction digits that should * be displayed for the given currency with usage. * * Important: The number of fraction digits for a given currency is NOT * guaranteed to be constant across versions of ICU or CLDR. For example, * do NOT use this value as a mechanism for deciding the magnitude used * to store currency values in a database. You should use this value for * display purposes only. * * @param currency null-terminated 3-letter ISO 4217 code * @param usage enum usage for the currency * @param ec input-output error code * @return a non-negative number of fraction digits to be * displayed, or 0 if there is an error * @stable ICU 54 */ U_CAPI int32_t U_EXPORT2 ucurr_getDefaultFractionDigitsForUsage(const UChar* currency, const UCurrencyUsage usage, UErrorCode* ec); /** * Returns the rounding increment for the given currency, or 0.0 if no * rounding is done by the currency.
* This is equivalent to ucurr_getRoundingIncrementForUsage(currency,UCURR_USAGE_STANDARD,ec); * @param currency null-terminated 3-letter ISO 4217 code * @param ec input-output error code * @return the non-negative rounding increment, or 0.0 if none, * or 0.0 if there is an error * @stable ICU 3.0 */ U_CAPI double U_EXPORT2 ucurr_getRoundingIncrement(const UChar* currency, UErrorCode* ec); /** * Returns the rounding increment for the given currency, or 0.0 if no * rounding is done by the currency given usage. * @param currency null-terminated 3-letter ISO 4217 code * @param usage enum usage for the currency * @param ec input-output error code * @return the non-negative rounding increment, or 0.0 if none, * or 0.0 if there is an error * @stable ICU 54 */ U_CAPI double U_EXPORT2 ucurr_getRoundingIncrementForUsage(const UChar* currency, const UCurrencyUsage usage, UErrorCode* ec); /** * Selector constants for ucurr_openISOCurrencies(). * * @see ucurr_openISOCurrencies * @stable ICU 3.2 */ typedef enum UCurrCurrencyType { /** * Select all ISO-4217 currency codes. * @stable ICU 3.2 */ UCURR_ALL = INT32_MAX, /** * Select only ISO-4217 commonly used currency codes. * These currencies can be found in common use, and they usually have * bank notes or coins associated with the currency code. * This does not include fund codes, precious metals and other * various ISO-4217 codes limited to special financial products. * @stable ICU 3.2 */ UCURR_COMMON = 1, /** * Select ISO-4217 uncommon currency codes. * These codes represent fund codes, precious metals and other * various ISO-4217 codes limited to special financial products. * A fund code is a monetary resource associated with a currency. * @stable ICU 3.2 */ UCURR_UNCOMMON = 2, /** * Select only deprecated ISO-4217 codes. * These codes are no longer in general public use. * @stable ICU 3.2 */ UCURR_DEPRECATED = 4, /** * Select only non-deprecated ISO-4217 codes. * These codes are in general public use.
* @stable ICU 3.2 */ UCURR_NON_DEPRECATED = 8 } UCurrCurrencyType; /** * Provides a UEnumeration object for listing ISO-4217 codes. * @param currType You can use one of several UCurrCurrencyType values for this * variable. You can also | (or) them together to get a specific list of * currencies. Most people will want to use the (UCURR_COMMON|UCURR_NON_DEPRECATED) value to * get a list of current currencies. * @param pErrorCode Error code * @stable ICU 3.2 */ U_CAPI UEnumeration * U_EXPORT2 ucurr_openISOCurrencies(uint32_t currType, UErrorCode *pErrorCode); /** * Queries if the given ISO 4217 3-letter code is available on the specified date range. * * Note: For checking availability of a currency on a specific date, specify the date on both 'from' and 'to' * * When 'from' is U_DATE_MIN and 'to' is U_DATE_MAX, this method checks if the specified currency is available any time. * If 'from' and 'to' are same UDate value, this method checks if the specified currency is available on that date. * * @param isoCode * The ISO 4217 3-letter code. * * @param from * The lower bound of the date range, inclusive. When 'from' is U_DATE_MIN, check the availability * of the currency any date before 'to' * * @param to * The upper bound of the date range, inclusive. When 'to' is U_DATE_MAX, check the availability of * the currency any date after 'from' * * @param errorCode * ICU error code * * @return true if the given ISO 4217 3-letter code is supported on the specified date range. * * @stable ICU 4.8 */ U_CAPI UBool U_EXPORT2 ucurr_isAvailable(const UChar* isoCode, UDate from, UDate to, UErrorCode* errorCode); /** * Finds the number of valid currency codes for the * given locale and date. * @param locale the locale for which to retrieve the * currency count. * @param date the date for which to retrieve the * currency count for the given locale. * @param ec error code * @return the number of currency codes for the * given locale and date. 
If 0, currency * codes couldn't be found or the input * values are invalid. * @stable ICU 4.0 */ U_CAPI int32_t U_EXPORT2 ucurr_countCurrencies(const char* locale, UDate date, UErrorCode* ec); /** * Finds a currency code for the given locale and date * @param locale the locale for which to retrieve a currency code. * Currency can be specified by the "currency" keyword * in which case it overrides the default currency code * @param date the date for which to retrieve a currency code for * the given locale. * @param index the index within the available list of currency codes * for the given locale on the given date. * @param buff fill in buffer. Can be NULL for preflighting. * @param buffCapacity capacity of the fill in buffer. Can be 0 for * preflighting. If it is non-zero, the buff parameter * must not be NULL. * @param ec error code * @return length of the currency string. It should always be 3. * If 0, currency couldn't be found or the input values are * invalid. * @stable ICU 4.0 */ U_CAPI int32_t U_EXPORT2 ucurr_forLocaleAndDate(const char* locale, UDate date, int32_t index, UChar* buff, int32_t buffCapacity, UErrorCode* ec); /** * Given a key and a locale, returns an array of string values in a preferred * order that would make a difference. These are all and only those values where * the open (creation) of the service with the locale formed from the input locale * plus input keyword and that value has different behavior than creation with the * input locale alone. * @param key one of the keys supported by this service. For now, only * "currency" is supported. * @param locale the locale * @param commonlyUsed if set to true it will return only commonly used values * with the given locale in preferred order. Otherwise, * it will return all the available values for the locale. * @param status error status * @return a string enumeration over keyword values for the given key and the locale.
* @stable ICU 4.2 */ U_CAPI UEnumeration* U_EXPORT2 ucurr_getKeywordValuesForLocale(const char* key, const char* locale, UBool commonlyUsed, UErrorCode* status); /** * Returns the ISO 4217 numeric code for the currency. *

Note: If the ISO 4217 numeric code is not assigned for the currency or * the currency is unknown, this function returns 0. * * @param currency null-terminated 3-letter ISO 4217 code * @return The ISO 4217 numeric code of the currency * @stable ICU 49 */ U_CAPI int32_t U_EXPORT2 ucurr_getNumericCode(const UChar* currency); #endif /* #if !UCONFIG_NO_FORMATTING */ #endif // _UCURR_H_ #if (NTDDI_VERSION >= NTDDI_WIN10_CO) // ucpmap.h // Copyright (C) 2018 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html // ucpmap.h // created: 2018sep03 Markus W. Scherer #ifndef __UCPMAP_H__ #define __UCPMAP_H__ U_CDECL_BEGIN /** * \file * * This file defines an abstract map from Unicode code points to integer values. * * @see UCPMap * @see UCPTrie * @see UMutableCPTrie */ /** * Abstract map from Unicode code points (U+0000..U+10FFFF) to integer values. * * @see UCPTrie * @see UMutableCPTrie * @stable ICU 63 */ typedef struct UCPMap UCPMap; /** * Selectors for how ucpmap_getRange() etc. should report value ranges overlapping with surrogates. * Most users should use UCPMAP_RANGE_NORMAL. * * @see ucpmap_getRange * @see ucptrie_getRange * @see umutablecptrie_getRange * @stable ICU 63 */ enum UCPMapRangeOption { /** * ucpmap_getRange() enumerates all same-value ranges as stored in the map. * Most users should use this option. * @stable ICU 63 */ UCPMAP_RANGE_NORMAL, /** * ucpmap_getRange() enumerates all same-value ranges as stored in the map, * except that lead surrogates (U+D800..U+DBFF) are treated as having the * surrogateValue, which is passed to getRange() as a separate parameter. * The surrogateValue is not transformed via filter(). * See U_IS_LEAD(c). * * Most users should use UCPMAP_RANGE_NORMAL instead. 
* * This option is useful for maps that map surrogate code *units* to * special values optimized for UTF-16 string processing * or for special error behavior for unpaired surrogates, * but those values are not to be associated with the lead surrogate code *points*. * @stable ICU 63 */ UCPMAP_RANGE_FIXED_LEAD_SURROGATES, /** * ucpmap_getRange() enumerates all same-value ranges as stored in the map, * except that all surrogates (U+D800..U+DFFF) are treated as having the * surrogateValue, which is passed to getRange() as a separate parameter. * The surrogateValue is not transformed via filter(). * See U_IS_SURROGATE(c). * * Most users should use UCPMAP_RANGE_NORMAL instead. * * This option is useful for maps that map surrogate code *units* to * special values optimized for UTF-16 string processing * or for special error behavior for unpaired surrogates, * but those values are not to be associated with the surrogate code *points*. * @stable ICU 63 */ UCPMAP_RANGE_FIXED_ALL_SURROGATES }; #ifndef U_IN_DOXYGEN typedef enum UCPMapRangeOption UCPMapRangeOption; #endif /** * Returns the value for a code point as stored in the map, with range checking. * Returns an implementation-defined error value if c is not in the range 0..U+10FFFF. * * @param map the map * @param c the code point * @return the map value, * or an implementation-defined error value if the code point is not in the range 0..U+10FFFF * @stable ICU 63 */ U_CAPI uint32_t U_EXPORT2 ucpmap_get(const UCPMap *map, UChar32 c); /** * Callback function type: Modifies a map value. * Optionally called by ucpmap_getRange()/ucptrie_getRange()/umutablecptrie_getRange(). * The modified value will be returned by the getRange function. * * Can be used to ignore some of the value bits, * make a filter for one of several values, * return a value index computed from the map value, etc.
*
 * @param context an opaque pointer, as passed into the getRange function
 * @param value a value from the map
 * @return the modified value
 * @stable ICU 63
 */
typedef uint32_t U_CALLCONV
UCPMapValueFilter(const void *context, uint32_t value);

/**
 * Returns the last code point such that all those from start to there have the same value.
 * Can be used to efficiently iterate over all same-value ranges in a map.
 * (This is normally faster than iterating over code points and get()ting each value,
 * but much slower than a data structure that stores ranges directly.)
 *
 * If the UCPMapValueFilter function pointer is not NULL, then
 * the value to be delivered is passed through that function, and the return value is the end
 * of the range where all values are modified to the same actual value.
 * The value is unchanged if that function pointer is NULL.
 *
 * Example:
 * \code
 * UChar32 start = 0, end;
 * uint32_t value;
 * while ((end = ucpmap_getRange(map, start, UCPMAP_RANGE_NORMAL, 0,
 *                               NULL, NULL, &value)) >= 0) {
 *     // Work with the range start..end and its value.
*     start = end + 1;
 * }
 * \endcode
 *
 * @param map the map
 * @param start range start
 * @param option defines whether surrogates are treated normally,
 *               or as having the surrogateValue; usually UCPMAP_RANGE_NORMAL
 * @param surrogateValue value for surrogates; ignored if option==UCPMAP_RANGE_NORMAL
 * @param filter a pointer to a function that may modify the map data value,
 *     or NULL if the values from the map are to be used unmodified
 * @param context an opaque pointer that is passed on to the filter function
 * @param pValue if not NULL, receives the value that every code point start..end has;
 *     may have been modified by filter(context, map value)
 *     if that function pointer is not NULL
 * @return the range end code point, or -1 if start is not a valid code point
 * @stable ICU 63
 */
U_CAPI UChar32 U_EXPORT2
ucpmap_getRange(const UCPMap *map, UChar32 start,
                UCPMapRangeOption option, uint32_t surrogateValue,
                UCPMapValueFilter *filter, const void *context, uint32_t *pValue);

U_CDECL_END

#endif // __UCPMAP_H__

// ucptrie.h
// Copyright (C) 2017 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html

// ucptrie.h (modified from utrie2.h)
// created: 2017dec29 Markus W. Scherer

#ifndef __UCPTRIE_H__
#define __UCPTRIE_H__

U_CDECL_BEGIN

/**
 * \file
 * \brief C API: This file defines an immutable Unicode code point trie.
 *
 * This file defines an immutable Unicode code point trie.
 *
 * @see UCPTrie
 * @see UMutableCPTrie
 */

#ifndef U_IN_DOXYGEN
/** @internal */
typedef union UCPTrieData {
    /** @internal */
    const void *ptr0;
    /** @internal */
    const uint16_t *ptr16;
    /** @internal */
    const uint32_t *ptr32;
    /** @internal */
    const uint8_t *ptr8;
} UCPTrieData;
#endif

/**
 * Immutable Unicode code point trie structure.
 * Fast, reasonably compact, map from Unicode code points (U+0000..U+10FFFF) to integer values.
* For details see https://icu.unicode.org/design/struct/utrie
 *
 * Do not access UCPTrie fields directly; use public functions and macros.
 * Functions are easy to use: They support all trie types and value widths.
 *
 * When performance is really important, macros provide faster access.
 * Most macros are specific to either "fast" or "small" tries, see UCPTrieType.
 * There are "fast" macros for special optimized use cases.
 *
 * The macros will return bogus values, or may crash, if used on the wrong type or value width.
 *
 * @see UMutableCPTrie
 * @stable ICU 63
 */
struct UCPTrie {
#ifndef U_IN_DOXYGEN
    /** @internal */
    const uint16_t *index;
    /** @internal */
    UCPTrieData data;

    /** @internal */
    int32_t indexLength;
    /** @internal */
    int32_t dataLength;
    /** Start of the last range which ends at U+10FFFF. @internal */
    UChar32 highStart;
    /** highStart>>12 @internal */
    uint16_t shifted12HighStart;

    /** @internal */
    int8_t type;  // UCPTrieType
    /** @internal */
    int8_t valueWidth;  // UCPTrieValueWidth

    /** padding/reserved @internal */
    uint32_t reserved32;
    /** padding/reserved @internal */
    uint16_t reserved16;

    /**
     * Internal index-3 null block offset.
     * Set to an impossibly high value (e.g., 0xffff) if there is no dedicated index-3 null block.
     * @internal
     */
    uint16_t index3NullOffset;
    /**
     * Internal data null block offset, not shifted.
     * Set to an impossibly high value (e.g., 0xfffff) if there is no dedicated data null block.
     * @internal
     */
    int32_t dataNullOffset;
    /** @internal */
    uint32_t nullValue;

#ifdef UCPTRIE_DEBUG
    /** @internal */
    const char *name;
#endif
#endif
};
#ifndef U_IN_DOXYGEN
typedef struct UCPTrie UCPTrie;
#endif

/**
 * Selectors for the type of a UCPTrie.
 * Different trade-offs for size vs. speed.
 *
 * @see umutablecptrie_buildImmutable
 * @see ucptrie_openFromBinary
 * @see ucptrie_getType
 * @stable ICU 63
 */
enum UCPTrieType {
    /**
     * For ucptrie_openFromBinary() to accept any type.
     * ucptrie_getType() will return the actual type.
* @stable ICU 63
     */
    UCPTRIE_TYPE_ANY = -1,
    /**
     * Fast/simple/larger BMP data structure. Use functions and "fast" macros.
     * @stable ICU 63
     */
    UCPTRIE_TYPE_FAST,
    /**
     * Small/slower BMP data structure. Use functions and "small" macros.
     * @stable ICU 63
     */
    UCPTRIE_TYPE_SMALL
};
#ifndef U_IN_DOXYGEN
typedef enum UCPTrieType UCPTrieType;
#endif

/**
 * Selectors for the number of bits in a UCPTrie data value.
 *
 * @see umutablecptrie_buildImmutable
 * @see ucptrie_openFromBinary
 * @see ucptrie_getValueWidth
 * @stable ICU 63
 */
enum UCPTrieValueWidth {
    /**
     * For ucptrie_openFromBinary() to accept any data value width.
     * ucptrie_getValueWidth() will return the actual data value width.
     * @stable ICU 63
     */
    UCPTRIE_VALUE_BITS_ANY = -1,
    /**
     * The trie stores 16 bits per data value.
     * It returns them as unsigned values 0..0xffff=65535.
     * @stable ICU 63
     */
    UCPTRIE_VALUE_BITS_16,
    /**
     * The trie stores 32 bits per data value.
     * @stable ICU 63
     */
    UCPTRIE_VALUE_BITS_32,
    /**
     * The trie stores 8 bits per data value.
     * It returns them as unsigned values 0..0xff=255.
     * @stable ICU 63
     */
    UCPTRIE_VALUE_BITS_8
};
#ifndef U_IN_DOXYGEN
typedef enum UCPTrieValueWidth UCPTrieValueWidth;
#endif

/**
 * Opens a trie from its binary form, stored in 32-bit-aligned memory.
 * Inverse of ucptrie_toBinary().
 *
 * The memory must remain valid and unchanged as long as the trie is used.
 * You must ucptrie_close() the trie once you are done using it.
*
 * @param type selects the trie type; results in an
 *             U_INVALID_FORMAT_ERROR if it does not match the binary data;
 *             use UCPTRIE_TYPE_ANY to accept any type
 * @param valueWidth selects the number of bits in a data value; results in an
 *                  U_INVALID_FORMAT_ERROR if it does not match the binary data;
 *                  use UCPTRIE_VALUE_BITS_ANY to accept any data value width
 * @param data a pointer to 32-bit-aligned memory containing the binary data of a UCPTrie
 * @param length the number of bytes available at data;
 *               can be more than necessary
 * @param pActualLength receives the actual number of bytes at data taken up by the trie data;
 *                      can be NULL
 * @param pErrorCode an in/out ICU UErrorCode
 * @return the trie
 *
 * @see umutablecptrie_open
 * @see umutablecptrie_buildImmutable
 * @see ucptrie_toBinary
 * @stable ICU 63
 */
U_CAPI UCPTrie * U_EXPORT2
ucptrie_openFromBinary(UCPTrieType type, UCPTrieValueWidth valueWidth,
                       const void *data, int32_t length, int32_t *pActualLength,
                       UErrorCode *pErrorCode);

/**
 * Closes a trie and releases associated memory.
 *
 * @param trie the trie
 * @stable ICU 63
 */
U_CAPI void U_EXPORT2
ucptrie_close(UCPTrie *trie);

/**
 * Returns the trie type.
 *
 * @param trie the trie
 * @return the trie type
 * @see ucptrie_openFromBinary
 * @see UCPTRIE_TYPE_ANY
 * @stable ICU 63
 */
U_CAPI UCPTrieType U_EXPORT2
ucptrie_getType(const UCPTrie *trie);

/**
 * Returns the number of bits in a trie data value.
 *
 * @param trie the trie
 * @return the number of bits in a trie data value
 * @see ucptrie_openFromBinary
 * @see UCPTRIE_VALUE_BITS_ANY
 * @stable ICU 63
 */
U_CAPI UCPTrieValueWidth U_EXPORT2
ucptrie_getValueWidth(const UCPTrie *trie);

/**
 * Returns the value for a code point as stored in the trie, with range checking.
 * Returns the trie error value if c is not in the range 0..U+10FFFF.
 *
 * Easier to use than UCPTRIE_FAST_GET() and similar macros but slower.
* Easier to use because, unlike the macros, this function works on all UCPTrie
 * objects, for all types and value widths.
 *
 * @param trie the trie
 * @param c the code point
 * @return the trie value,
 *         or the trie error value if the code point is not in the range 0..U+10FFFF
 * @stable ICU 63
 */
U_CAPI uint32_t U_EXPORT2
ucptrie_get(const UCPTrie *trie, UChar32 c);

/**
 * Returns the last code point such that all those from start to there have the same value.
 * Can be used to efficiently iterate over all same-value ranges in a trie.
 * (This is normally faster than iterating over code points and get()ting each value,
 * but much slower than a data structure that stores ranges directly.)
 *
 * If the UCPMapValueFilter function pointer is not NULL, then
 * the value to be delivered is passed through that function, and the return value is the end
 * of the range where all values are modified to the same actual value.
 * The value is unchanged if that function pointer is NULL.
 *
 * Example:
 * \code
 * UChar32 start = 0, end;
 * uint32_t value;
 * while ((end = ucptrie_getRange(trie, start, UCPMAP_RANGE_NORMAL, 0,
 *                                NULL, NULL, &value)) >= 0) {
 *     // Work with the range start..end and its value.
*     start = end + 1;
 * }
 * \endcode
 *
 * @param trie the trie
 * @param start range start
 * @param option defines whether surrogates are treated normally,
 *               or as having the surrogateValue; usually UCPMAP_RANGE_NORMAL
 * @param surrogateValue value for surrogates; ignored if option==UCPMAP_RANGE_NORMAL
 * @param filter a pointer to a function that may modify the trie data value,
 *     or NULL if the values from the trie are to be used unmodified
 * @param context an opaque pointer that is passed on to the filter function
 * @param pValue if not NULL, receives the value that every code point start..end has;
 *     may have been modified by filter(context, trie value)
 *     if that function pointer is not NULL
 * @return the range end code point, or -1 if start is not a valid code point
 * @stable ICU 63
 */
U_CAPI UChar32 U_EXPORT2
ucptrie_getRange(const UCPTrie *trie, UChar32 start,
                 UCPMapRangeOption option, uint32_t surrogateValue,
                 UCPMapValueFilter *filter, const void *context, uint32_t *pValue);

/**
 * Writes a memory-mappable form of the trie into 32-bit aligned memory.
 * Inverse of ucptrie_openFromBinary().
 *
 * @param trie the trie
 * @param data a pointer to 32-bit-aligned memory to be filled with the trie data;
 *             can be NULL if capacity==0
 * @param capacity the number of bytes available at data, or 0 for pure preflighting
 * @param pErrorCode an in/out ICU UErrorCode;
 *                   U_BUFFER_OVERFLOW_ERROR if the capacity is too small
 * @return the number of bytes written or (if buffer overflow) needed for the trie
 *
 * @see ucptrie_openFromBinary()
 * @stable ICU 63
 */
U_CAPI int32_t U_EXPORT2
ucptrie_toBinary(const UCPTrie *trie, void *data, int32_t capacity, UErrorCode *pErrorCode);

/**
 * Macro parameter value for a trie with 16-bit data values.
 * Use the name of this macro as a "dataAccess" parameter in other macros.
 * Do not use this macro in any other way.
*
 * @see UCPTRIE_VALUE_BITS_16
 * @stable ICU 63
 */
#define UCPTRIE_16(trie, i) ((trie)->data.ptr16[i])

/**
 * Macro parameter value for a trie with 32-bit data values.
 * Use the name of this macro as a "dataAccess" parameter in other macros.
 * Do not use this macro in any other way.
 *
 * @see UCPTRIE_VALUE_BITS_32
 * @stable ICU 63
 */
#define UCPTRIE_32(trie, i) ((trie)->data.ptr32[i])

/**
 * Macro parameter value for a trie with 8-bit data values.
 * Use the name of this macro as a "dataAccess" parameter in other macros.
 * Do not use this macro in any other way.
 *
 * @see UCPTRIE_VALUE_BITS_8
 * @stable ICU 63
 */
#define UCPTRIE_8(trie, i) ((trie)->data.ptr8[i])

/**
 * Returns a trie value for a code point, with range checking.
 * Returns the trie error value if c is not in the range 0..U+10FFFF.
 *
 * @param trie (const UCPTrie *, in) the trie; must have type UCPTRIE_TYPE_FAST
 * @param dataAccess UCPTRIE_16, UCPTRIE_32, or UCPTRIE_8 according to the trie’s value width
 * @param c (UChar32, in) the input code point
 * @return The code point's trie value.
 * @stable ICU 63
 */
#define UCPTRIE_FAST_GET(trie, dataAccess, c) dataAccess(trie, _UCPTRIE_CP_INDEX(trie, 0xffff, c))

/**
 * Returns a trie value for a code point, with range checking.
 * Returns the trie error value if c is not in the range U+0000..U+10FFFF.
 *
 * @param trie (const UCPTrie *, in) the trie; must have type UCPTRIE_TYPE_SMALL
 * @param dataAccess UCPTRIE_16, UCPTRIE_32, or UCPTRIE_8 according to the trie’s value width
 * @param c (UChar32, in) the input code point
 * @return The code point's trie value.
 * @stable ICU 63
 */
#define UCPTRIE_SMALL_GET(trie, dataAccess, c) \
    dataAccess(trie, _UCPTRIE_CP_INDEX(trie, UCPTRIE_SMALL_MAX, c))

/**
 * UTF-16: Reads the next code point (UChar32 c, out), post-increments src,
 * and gets a value from the trie.
 * Sets the trie error value if c is an unpaired surrogate.
*
 * @param trie (const UCPTrie *, in) the trie; must have type UCPTRIE_TYPE_FAST
 * @param dataAccess UCPTRIE_16, UCPTRIE_32, or UCPTRIE_8 according to the trie’s value width
 * @param src (const UChar *, in/out) the source text pointer
 * @param limit (const UChar *, in) the limit pointer for the text, or NULL if NUL-terminated
 * @param c (UChar32, out) variable for the code point
 * @param result (out) variable for the trie lookup result
 * @stable ICU 63
 */
#define UCPTRIE_FAST_U16_NEXT(trie, dataAccess, src, limit, c, result) UPRV_BLOCK_MACRO_BEGIN { \
    (c) = *(src)++; \
    int32_t __index; \
    if (!U16_IS_SURROGATE(c)) { \
        __index = _UCPTRIE_FAST_INDEX(trie, c); \
    } else { \
        uint16_t __c2; \
        if (U16_IS_SURROGATE_LEAD(c) && (src) != (limit) && U16_IS_TRAIL(__c2 = *(src))) { \
            ++(src); \
            (c) = U16_GET_SUPPLEMENTARY((c), __c2); \
            __index = _UCPTRIE_SMALL_INDEX(trie, c); \
        } else { \
            __index = (trie)->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET; \
        } \
    } \
    (result) = dataAccess(trie, __index); \
} UPRV_BLOCK_MACRO_END

/**
 * UTF-16: Reads the previous code point (UChar32 c, out), pre-decrements src,
 * and gets a value from the trie.
 * Sets the trie error value if c is an unpaired surrogate.
*
 * @param trie (const UCPTrie *, in) the trie; must have type UCPTRIE_TYPE_FAST
 * @param dataAccess UCPTRIE_16, UCPTRIE_32, or UCPTRIE_8 according to the trie’s value width
 * @param start (const UChar *, in) the start pointer for the text
 * @param src (const UChar *, in/out) the source text pointer
 * @param c (UChar32, out) variable for the code point
 * @param result (out) variable for the trie lookup result
 * @stable ICU 63
 */
#define UCPTRIE_FAST_U16_PREV(trie, dataAccess, start, src, c, result) UPRV_BLOCK_MACRO_BEGIN { \
    (c) = *--(src); \
    int32_t __index; \
    if (!U16_IS_SURROGATE(c)) { \
        __index = _UCPTRIE_FAST_INDEX(trie, c); \
    } else { \
        uint16_t __c2; \
        if (U16_IS_SURROGATE_TRAIL(c) && (src) != (start) && U16_IS_LEAD(__c2 = *((src) - 1))) { \
            --(src); \
            (c) = U16_GET_SUPPLEMENTARY(__c2, (c)); \
            __index = _UCPTRIE_SMALL_INDEX(trie, c); \
        } else { \
            __index = (trie)->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET; \
        } \
    } \
    (result) = dataAccess(trie, __index); \
} UPRV_BLOCK_MACRO_END

/**
 * UTF-8: Post-increments src and gets a value from the trie.
 * Sets the trie error value for an ill-formed byte sequence.
 *
 * Unlike UCPTRIE_FAST_U16_NEXT() this UTF-8 macro does not provide the code point
 * because it would be more work to do so and is often not needed.
 * If the trie value differs from the error value, then the byte sequence is well-formed,
 * and the code point can be assembled without revalidation.
*
 * @param trie (const UCPTrie *, in) the trie; must have type UCPTRIE_TYPE_FAST
 * @param dataAccess UCPTRIE_16, UCPTRIE_32, or UCPTRIE_8 according to the trie’s value width
 * @param src (const char *, in/out) the source text pointer
 * @param limit (const char *, in) the limit pointer for the text (must not be NULL)
 * @param result (out) variable for the trie lookup result
 * @stable ICU 63
 */
#define UCPTRIE_FAST_U8_NEXT(trie, dataAccess, src, limit, result) UPRV_BLOCK_MACRO_BEGIN { \
    int32_t __lead = (uint8_t)*(src)++; \
    if (!U8_IS_SINGLE(__lead)) { \
        uint8_t __t1, __t2, __t3; \
        if ((src) != (limit) && \
            (__lead >= 0xe0 ? \
                __lead < 0xf0 ?  /* U+0800..U+FFFF except surrogates */ \
                    U8_LEAD3_T1_BITS[__lead &= 0xf] & (1 << ((__t1 = *(src)) >> 5)) && \
                    ++(src) != (limit) && (__t2 = *(src) - 0x80) <= 0x3f && \
                    (__lead = ((int32_t)(trie)->index[(__lead << 6) + (__t1 & 0x3f)]) + __t2, 1) \
                :  /* U+10000..U+10FFFF */ \
                    (__lead -= 0xf0) <= 4 && \
                    U8_LEAD4_T1_BITS[(__t1 = *(src)) >> 4] & (1 << __lead) && \
                    (__lead = (__lead << 6) | (__t1 & 0x3f), ++(src) != (limit)) && \
                    (__t2 = *(src) - 0x80) <= 0x3f && \
                    ++(src) != (limit) && (__t3 = *(src) - 0x80) <= 0x3f && \
                    (__lead = __lead >= (trie)->shifted12HighStart ? \
                        (trie)->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET : \
                        ucptrie_internalSmallU8Index((trie), __lead, __t2, __t3), 1) \
            :  /* U+0080..U+07FF */ \
                __lead >= 0xc2 && (__t1 = *(src) - 0x80) <= 0x3f && \
                (__lead = (int32_t)(trie)->index[__lead & 0x1f] + __t1, 1))) { \
            ++(src); \
        } else { \
            __lead = (trie)->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET;  /* ill-formed*/ \
        } \
    } \
    (result) = dataAccess(trie, __lead); \
} UPRV_BLOCK_MACRO_END

/**
 * UTF-8: Pre-decrements src and gets a value from the trie.
 * Sets the trie error value for an ill-formed byte sequence.
 *
 * Unlike UCPTRIE_FAST_U16_PREV() this UTF-8 macro does not provide the code point
 * because it would be more work to do so and is often not needed.
* If the trie value differs from the error value, then the byte sequence is well-formed,
 * and the code point can be assembled without revalidation.
 *
 * @param trie (const UCPTrie *, in) the trie; must have type UCPTRIE_TYPE_FAST
 * @param dataAccess UCPTRIE_16, UCPTRIE_32, or UCPTRIE_8 according to the trie’s value width
 * @param start (const char *, in) the start pointer for the text
 * @param src (const char *, in/out) the source text pointer
 * @param result (out) variable for the trie lookup result
 * @stable ICU 63
 */
#define UCPTRIE_FAST_U8_PREV(trie, dataAccess, start, src, result) UPRV_BLOCK_MACRO_BEGIN { \
    int32_t __index = (uint8_t)*--(src); \
    if (!U8_IS_SINGLE(__index)) { \
        __index = ucptrie_internalU8PrevIndex((trie), __index, (const uint8_t *)(start), \
                                              (const uint8_t *)(src)); \
        (src) -= __index & 7; \
        __index >>= 3; \
    } \
    (result) = dataAccess(trie, __index); \
} UPRV_BLOCK_MACRO_END

/**
 * Returns a trie value for an ASCII code point, without range checking.
 *
 * @param trie (const UCPTrie *, in) the trie (of either fast or small type)
 * @param dataAccess UCPTRIE_16, UCPTRIE_32, or UCPTRIE_8 according to the trie’s value width
 * @param c (UChar32, in) the input code point; must be U+0000..U+007F
 * @return The ASCII code point's trie value.
 * @stable ICU 63
 */
#define UCPTRIE_ASCII_GET(trie, dataAccess, c) dataAccess(trie, c)

/**
 * Returns a trie value for a BMP code point (U+0000..U+FFFF), without range checking.
 * Can be used to look up a value for a UTF-16 code unit if other parts of
 * the string processing check for surrogates.
 *
 * @param trie (const UCPTrie *, in) the trie; must have type UCPTRIE_TYPE_FAST
 * @param dataAccess UCPTRIE_16, UCPTRIE_32, or UCPTRIE_8 according to the trie’s value width
 * @param c (UChar32, in) the input code point, must be U+0000..U+FFFF
 * @return The BMP code point's trie value.
* @stable ICU 63
 */
#define UCPTRIE_FAST_BMP_GET(trie, dataAccess, c) dataAccess(trie, _UCPTRIE_FAST_INDEX(trie, c))

/**
 * Returns a trie value for a supplementary code point (U+10000..U+10FFFF),
 * without range checking.
 *
 * @param trie (const UCPTrie *, in) the trie; must have type UCPTRIE_TYPE_FAST
 * @param dataAccess UCPTRIE_16, UCPTRIE_32, or UCPTRIE_8 according to the trie’s value width
 * @param c (UChar32, in) the input code point, must be U+10000..U+10FFFF
 * @return The supplementary code point's trie value.
 * @stable ICU 63
 */
#define UCPTRIE_FAST_SUPP_GET(trie, dataAccess, c) dataAccess(trie, _UCPTRIE_SMALL_INDEX(trie, c))

/* Internal definitions ----------------------------------------------------- */

#ifndef U_IN_DOXYGEN

/**
 * Internal implementation constants.
 * These are needed for the API macros, but users should not use these directly.
 * @internal
 */
enum {
    /** @internal */
    UCPTRIE_FAST_SHIFT = 6,

    /** Number of entries in a data block for code points below the fast limit. 64=0x40 @internal */
    UCPTRIE_FAST_DATA_BLOCK_LENGTH = 1 << UCPTRIE_FAST_SHIFT,

    /** Mask for getting the lower bits for the in-fast-data-block offset. @internal */
    UCPTRIE_FAST_DATA_MASK = UCPTRIE_FAST_DATA_BLOCK_LENGTH - 1,

    /** @internal */
    UCPTRIE_SMALL_MAX = 0xfff,

    /**
     * Offset from dataLength (to be subtracted) for fetching the
     * value returned for out-of-range code points and ill-formed UTF-8/16.
     * @internal
     */
    UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET = 1,
    /**
     * Offset from dataLength (to be subtracted) for fetching the
     * value returned for code points highStart..U+10FFFF.
* @internal
     */
    UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET = 2
};

/* Internal functions and macros -------------------------------------------- */
// Do not conditionalize with #ifndef U_HIDE_INTERNAL_API, needed for public API

/** @internal */
U_CAPI int32_t U_EXPORT2
ucptrie_internalSmallIndex(const UCPTrie *trie, UChar32 c);

/** @internal */
U_CAPI int32_t U_EXPORT2
ucptrie_internalSmallU8Index(const UCPTrie *trie, int32_t lt1, uint8_t t2, uint8_t t3);

/**
 * Internal function for part of the UCPTRIE_FAST_U8_PREV() macro implementation.
 * Do not call directly.
 * @internal
 */
U_CAPI int32_t U_EXPORT2
ucptrie_internalU8PrevIndex(const UCPTrie *trie, UChar32 c,
                            const uint8_t *start, const uint8_t *src);

/** Internal trie getter for a code point below the fast limit. Returns the data index. @internal */
#define _UCPTRIE_FAST_INDEX(trie, c) \
    ((int32_t)(trie)->index[(c) >> UCPTRIE_FAST_SHIFT] + ((c) & UCPTRIE_FAST_DATA_MASK))

/** Internal trie getter for a code point at or above the fast limit. Returns the data index. @internal */
#define _UCPTRIE_SMALL_INDEX(trie, c) \
    ((c) >= (trie)->highStart ? \
        (trie)->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET : \
        ucptrie_internalSmallIndex(trie, c))

/**
 * Internal trie getter for a code point, with checking that c is in U+0000..10FFFF.
 * Returns the data index.
 * @internal
 */
#define _UCPTRIE_CP_INDEX(trie, fastMax, c) \
    ((uint32_t)(c) <= (uint32_t)(fastMax) ? \
        _UCPTRIE_FAST_INDEX(trie, c) : \
        (uint32_t)(c) <= 0x10ffff ? \
            _UCPTRIE_SMALL_INDEX(trie, c) : \
            (trie)->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET)

U_CDECL_END

#endif  // U_IN_DOXYGEN

#endif // __UCPTRIE_H__

// umutablecptrie.h
// Copyright (C) 2017 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html

// umutablecptrie.h (split out of ucptrie.h)
// created: 2018jan24 Markus W. Scherer

#ifndef __UMUTABLECPTRIE_H__
#define __UMUTABLECPTRIE_H__

U_CDECL_BEGIN

/**
 * \file
 *
 * This file defines a mutable Unicode code point trie.
 *
 * @see UCPTrie
 * @see UMutableCPTrie
 */

/**
 * Mutable Unicode code point trie.
 * Fast map from Unicode code points (U+0000..U+10FFFF) to 32-bit integer values.
 * For details see https://icu.unicode.org/design/struct/utrie
 *
 * Setting values (especially ranges) and lookup is fast.
 * The mutable trie is only somewhat space-efficient.
 * It builds a compacted, immutable UCPTrie.
 *
 * This trie can be modified while iterating over its contents.
 * For example, it is possible to merge its values with those from another
 * set of ranges (e.g., another mutable or immutable trie):
 * Iterate over those source ranges; for each of them iterate over this trie;
 * add the source value into the value of each trie range.
 *
 * @see UCPTrie
 * @see umutablecptrie_buildImmutable
 * @stable ICU 63
 */
typedef struct UMutableCPTrie UMutableCPTrie;

/**
 * Creates a mutable trie that initially maps each Unicode code point to the same value.
 * It uses 32-bit data values until umutablecptrie_buildImmutable() is called.
 * umutablecptrie_buildImmutable() takes a valueWidth parameter which
 * determines the number of bits in the data value in the resulting UCPTrie.
 * You must umutablecptrie_close() the trie once you are done using it.
 *
 * @param initialValue the initial value that is set for all code points
 * @param errorValue the value for out-of-range code points and ill-formed UTF-8/16
 * @param pErrorCode an in/out ICU UErrorCode
 * @return the trie
 * @stable ICU 63
 */
U_CAPI UMutableCPTrie * U_EXPORT2
umutablecptrie_open(uint32_t initialValue, uint32_t errorValue, UErrorCode *pErrorCode);

/**
 * Clones a mutable trie.
 * You must umutablecptrie_close() the clone once you are done using it.
*
 * @param other the trie to clone
 * @param pErrorCode an in/out ICU UErrorCode
 * @return the trie clone
 * @stable ICU 63
 */
U_CAPI UMutableCPTrie * U_EXPORT2
umutablecptrie_clone(const UMutableCPTrie *other, UErrorCode *pErrorCode);

/**
 * Closes a mutable trie and releases associated memory.
 *
 * @param trie the trie
 * @stable ICU 63
 */
U_CAPI void U_EXPORT2
umutablecptrie_close(UMutableCPTrie *trie);

/**
 * Creates a mutable trie with the same contents as the UCPMap.
 * You must umutablecptrie_close() the mutable trie once you are done using it.
 *
 * @param map the source map
 * @param pErrorCode an in/out ICU UErrorCode
 * @return the mutable trie
 * @stable ICU 63
 */
U_CAPI UMutableCPTrie * U_EXPORT2
umutablecptrie_fromUCPMap(const UCPMap *map, UErrorCode *pErrorCode);

/**
 * Creates a mutable trie with the same contents as the immutable one.
 * You must umutablecptrie_close() the mutable trie once you are done using it.
 *
 * @param trie the immutable trie
 * @param pErrorCode an in/out ICU UErrorCode
 * @return the mutable trie
 * @stable ICU 63
 */
U_CAPI UMutableCPTrie * U_EXPORT2
umutablecptrie_fromUCPTrie(const UCPTrie *trie, UErrorCode *pErrorCode);

/**
 * Returns the value for a code point as stored in the trie.
 *
 * @param trie the trie
 * @param c the code point
 * @return the value
 * @stable ICU 63
 */
U_CAPI uint32_t U_EXPORT2
umutablecptrie_get(const UMutableCPTrie *trie, UChar32 c);

/**
 * Returns the last code point such that all those from start to there have the same value.
 * Can be used to efficiently iterate over all same-value ranges in a trie.
 * (This is normally faster than iterating over code points and get()ting each value,
 * but much slower than a data structure that stores ranges directly.)
 *
 * The trie can be modified between calls to this function.
*
 * If the UCPMapValueFilter function pointer is not NULL, then
 * the value to be delivered is passed through that function, and the return value is the end
 * of the range where all values are modified to the same actual value.
 * The value is unchanged if that function pointer is NULL.
 *
 * See the same-signature ucptrie_getRange() for a code sample.
 *
 * @param trie the trie
 * @param start range start
 * @param option defines whether surrogates are treated normally,
 *               or as having the surrogateValue; usually UCPMAP_RANGE_NORMAL
 * @param surrogateValue value for surrogates; ignored if option==UCPMAP_RANGE_NORMAL
 * @param filter a pointer to a function that may modify the trie data value,
 *     or NULL if the values from the trie are to be used unmodified
 * @param context an opaque pointer that is passed on to the filter function
 * @param pValue if not NULL, receives the value that every code point start..end has;
 *     may have been modified by filter(context, trie value)
 *     if that function pointer is not NULL
 * @return the range end code point, or -1 if start is not a valid code point
 * @stable ICU 63
 */
U_CAPI UChar32 U_EXPORT2
umutablecptrie_getRange(const UMutableCPTrie *trie, UChar32 start,
                        UCPMapRangeOption option, uint32_t surrogateValue,
                        UCPMapValueFilter *filter, const void *context, uint32_t *pValue);

/**
 * Sets a value for a code point.
 *
 * @param trie the trie
 * @param c the code point
 * @param value the value
 * @param pErrorCode an in/out ICU UErrorCode
 * @stable ICU 63
 */
U_CAPI void U_EXPORT2
umutablecptrie_set(UMutableCPTrie *trie, UChar32 c, uint32_t value, UErrorCode *pErrorCode);

/**
 * Sets a value for each code point [start..end].
 * Faster and more space-efficient than setting the value for each code point separately.
*
 * @param trie the trie
 * @param start the first code point to get the value
 * @param end the last code point to get the value (inclusive)
 * @param value the value
 * @param pErrorCode an in/out ICU UErrorCode
 * @stable ICU 63
 */
U_CAPI void U_EXPORT2
umutablecptrie_setRange(UMutableCPTrie *trie,
                        UChar32 start, UChar32 end,
                        uint32_t value, UErrorCode *pErrorCode);

/**
 * Compacts the data and builds an immutable UCPTrie according to the parameters.
 * After this, the mutable trie will be empty.
 *
 * The mutable trie stores 32-bit values until buildImmutable() is called.
 * If values shorter than 32 bits are to be stored in the immutable trie,
 * then the upper bits are discarded.
 * For example, when the mutable trie contains values 0x81, -0x7f, and 0xa581,
 * and the value width is 8 bits, then each of these is stored as 0x81
 * and the immutable trie will return that as an unsigned value.
 * (Some implementations may want to make productive temporary use of the upper bits
 * until buildImmutable() discards them.)
 *
 * Not every possible set of mappings can be built into a UCPTrie,
 * because of limitations resulting from speed and space optimizations.
 * Every Unicode assigned character can be mapped to a unique value.
 * Typical data yields data structures far smaller than the limitations.
 *
 * It is possible to construct extremely unusual mappings that exceed the data structure limits.
 * In such a case this function will fail with a U_INDEX_OUTOFBOUNDS_ERROR.
*
 * @param trie the mutable trie
 * @param type selects the trie type
 * @param valueWidth selects the number of bits in a trie data value; if smaller than 32 bits,
 *                   then the values stored in the trie will be truncated first
 * @param pErrorCode an in/out ICU UErrorCode
 * @return the immutable trie
 *
 * @see umutablecptrie_fromUCPTrie
 * @stable ICU 63
 */
U_CAPI UCPTrie * U_EXPORT2
umutablecptrie_buildImmutable(UMutableCPTrie *trie, UCPTrieType type, UCPTrieValueWidth valueWidth,
                              UErrorCode *pErrorCode);

U_CDECL_END

#endif // __UMUTABLECPTRIE_H__

#endif // (NTDDI_VERSION >= NTDDI_WIN10_CO)

// ucnv_err.h
// Copyright (C) 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 **********************************************************************
 * Copyright (C) 1999-2009, International Business Machines
 * Corporation and others. All Rights Reserved.
 **********************************************************************
 *
 *
 * ucnv_err.h:
 */

/**
 * \file
 * \brief C API: UConverter predefined error callbacks
 *
 *
 * <h2>Error Behaviour Functions</h2>
* Defines some error behaviour functions called by ucnv_{from,to}Unicode
 * These are provided as part of ICU and many are stable, but they
 * can also be considered only as an example of what can be done with
 * callbacks. You may of course write your own.
 *
 * If you want to write your own, you may also find the functions from
 * ucnv_cb.h useful when writing your own callbacks.
 *
 * These functions, although public, should NEVER be called directly.
 * They should be used as parameters to the ucnv_setFromUCallBack
 * and ucnv_setToUCallBack functions, to set the behaviour of a converter
 * when it encounters ILLEGAL/UNMAPPED/INVALID sequences.
 *
 * usage example: 'STOP' doesn't need any context, but newContext
 * could be set to something other than 'NULL' if needed. The available
 * contexts in this header can modify the default behavior of the callback.
 *
 * \code
 * UErrorCode err = U_ZERO_ERROR;
 * UConverter *myConverter = ucnv_open("ibm-949", &err);
 * const void *oldContext;
 * UConverterFromUCallback oldAction;
 *
 *
 * if (U_SUCCESS(err))
 * {
 *     ucnv_setFromUCallBack(myConverter,
 *                           UCNV_FROM_U_CALLBACK_STOP,
 *                           NULL,
 *                           &oldAction,
 *                           &oldContext,
 *                           &err);
 * }
 * \endcode
 *
 * The code above tells "myConverter" to stop when it encounters an
 * ILLEGAL/TRUNCATED/INVALID sequence when it is used to convert from
 * Unicode -> Codepage. The behavior from Codepage to Unicode is not changed,
 * and ucnv_setToUCallBack would need to be called in order to change
 * that behavior too.
* * Here is an example with a context: * * \code * UErrorCode err = U_ZERO_ERROR; * UConverter *myConverter = ucnv_open("ibm-949", &err); * const void *oldContext; * UConverterToUCallback oldAction; * * * if (U_SUCCESS(err)) * { * ucnv_setToUCallBack(myConverter, * UCNV_TO_U_CALLBACK_SUBSTITUTE, * UCNV_SUB_STOP_ON_ILLEGAL, * &oldAction, * &oldContext, * &err); * } * \endcode * * The code above tells "myConverter" to stop when it encounters an * ILLEGAL/TRUNCATED/INVALID sequence when it is used to convert from * Codepage -> Unicode. Any unmapped and legal characters will be * substituted with the default substitution character. */ #ifndef UCNV_ERR_H #define UCNV_ERR_H #if !UCONFIG_NO_CONVERSION /** Forward declaring the UConverter structure. @stable ICU 2.0 */ struct UConverter; /** @stable ICU 2.0 */ typedef struct UConverter UConverter; /** * FROM_U, TO_U context options for sub callback * @stable ICU 2.0 */ #define UCNV_SUB_STOP_ON_ILLEGAL "i" /** * FROM_U, TO_U context options for skip callback * @stable ICU 2.0 */ #define UCNV_SKIP_STOP_ON_ILLEGAL "i" /** * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to ICU (%UXXXX) * @stable ICU 2.0 */ #define UCNV_ESCAPE_ICU NULL /** * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to JAVA (\\uXXXX) * @stable ICU 2.0 */ #define UCNV_ESCAPE_JAVA "J" /** * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to C (\\uXXXX \\UXXXXXXXX) * TO_U_CALLBACK_ESCAPE context option to escape the character value according to C (\\xXXXX) * @stable ICU 2.0 */ #define UCNV_ESCAPE_C "C" /** * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to XML Decimal escape \htmlonly(&#DDDD;)\endhtmlonly * TO_U_CALLBACK_ESCAPE context option to escape the character value according to XML Decimal escape \htmlonly(&#DDDD;)\endhtmlonly * @stable ICU 2.0 */ #define UCNV_ESCAPE_XML_DEC "D" /** * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according 
to XML Hex escape \htmlonly(&#xXXXX;)\endhtmlonly * TO_U_CALLBACK_ESCAPE context option to escape the character value according to XML Hex escape \htmlonly(&#xXXXX;)\endhtmlonly * @stable ICU 2.0 */ #define UCNV_ESCAPE_XML_HEX "X" /** * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to Unicode (U+XXXXX) * @stable ICU 2.0 */ #define UCNV_ESCAPE_UNICODE "U" /** * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to CSS2 conventions (\\HH..H, that is, * a backslash, 1..6 hex digits, and a space) * @stable ICU 4.0 */ #define UCNV_ESCAPE_CSS2 "S" /** * The process condition code to be used with the callbacks. * Codes which are greater than UCNV_IRREGULAR should be * passed on to any chained callbacks. * @stable ICU 2.0 */ typedef enum { UCNV_UNASSIGNED = 0, /**< The code point is unassigned. The error code U_INVALID_CHAR_FOUND will be set. */ UCNV_ILLEGAL = 1, /**< The code point is illegal. For example, \\x81\\x2E is illegal in SJIS because \\x2E is not a valid trail byte for the \\x81 lead byte. Also, starting with Unicode 3.0.1, non-shortest byte sequences in UTF-8 (like \\xC1\\xA1 instead of \\x61 for U+0061) are also illegal, not just irregular. The error code U_ILLEGAL_CHAR_FOUND will be set. */ UCNV_IRREGULAR = 2, /**< The code point is not a regular sequence in the encoding. For example, \\xED\\xA0\\x80..\\xED\\xBF\\xBF are irregular UTF-8 byte sequences for single surrogate code points. The error code U_INVALID_CHAR_FOUND will be set. */ UCNV_RESET = 3, /**< The callback is called with this reason when a 'reset' has occurred. The callback should reset all state. */ UCNV_CLOSE = 4, /**< Called when the converter is closed. The callback should release any allocated memory. */ UCNV_CLONE = 5 /**< Called when ucnv_safeClone() is called on the converter. The pointer available as the 'context' is an alias to the original converter's context pointer. 
If the context must be owned by the new converter, the callback must clone the data and call ucnv_setFromUCallback (or setToUCallback) with the correct pointer. @stable ICU 2.2 */ } UConverterCallbackReason; /** * The structure for the fromUnicode callback function parameter. * @stable ICU 2.0 */ typedef struct { uint16_t size; /**< The size of this struct. @stable ICU 2.0 */ UBool flush; /**< The internal state of converter will be reset and data flushed if set to true. @stable ICU 2.0 */ UConverter *converter; /**< Pointer to the converter that is opened and to which this struct is passed as an argument. @stable ICU 2.0 */ const UChar *source; /**< Pointer to the source buffer. @stable ICU 2.0 */ const UChar *sourceLimit; /**< Pointer to the limit (end + 1) of source buffer. @stable ICU 2.0 */ char *target; /**< Pointer to the target buffer. @stable ICU 2.0 */ const char *targetLimit; /**< Pointer to the limit (end + 1) of target buffer. @stable ICU 2.0 */ int32_t *offsets; /**< Pointer to the buffer that receives the offsets. *offset = blah ; offset++;. @stable ICU 2.0 */ } UConverterFromUnicodeArgs; /** * The structure for the toUnicode callback function parameter. * @stable ICU 2.0 */ typedef struct { uint16_t size; /**< The size of this struct. @stable ICU 2.0 */ UBool flush; /**< The internal state of converter will be reset and data flushed if set to true. @stable ICU 2.0 */ UConverter *converter; /**< Pointer to the converter that is opened and to which this struct is passed as an argument. @stable ICU 2.0 */ const char *source; /**< Pointer to the source buffer. @stable ICU 2.0 */ const char *sourceLimit; /**< Pointer to the limit (end + 1) of source buffer. @stable ICU 2.0 */ UChar *target; /**< Pointer to the target buffer. @stable ICU 2.0 */ const UChar *targetLimit; /**< Pointer to the limit (end + 1) of target buffer. @stable ICU 2.0 */ int32_t *offsets; /**< Pointer to the buffer that receives the offsets. *offset = blah ; offset++;. 
@stable ICU 2.0 */ } UConverterToUnicodeArgs; /** * DO NOT CALL THIS FUNCTION DIRECTLY! * This From Unicode callback STOPS at the ILLEGAL_SEQUENCE, * returning the error code back to the caller immediately. * * @param context Pointer to the callback's private data * @param fromUArgs Information about the conversion in progress * @param codeUnits Points to 'length' UChars of the concerned Unicode sequence * @param length Size (in UChars) of the concerned Unicode sequence * @param codePoint Single UChar32 (UTF-32) containing the concerned Unicode codepoint. * @param reason Defines the reason the callback was invoked * @param err This should always be set to a failure status prior to calling. * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 UCNV_FROM_U_CALLBACK_STOP ( const void *context, UConverterFromUnicodeArgs *fromUArgs, const UChar* codeUnits, int32_t length, UChar32 codePoint, UConverterCallbackReason reason, UErrorCode * err); /** * DO NOT CALL THIS FUNCTION DIRECTLY! * This To Unicode callback STOPS at the ILLEGAL_SEQUENCE, * returning the error code back to the caller immediately. * * @param context Pointer to the callback's private data * @param toUArgs Information about the conversion in progress * @param codeUnits Points to 'length' bytes of the concerned codepage sequence * @param length Size (in bytes) of the concerned codepage sequence * @param reason Defines the reason the callback was invoked * @param err This should always be set to a failure status prior to calling. * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 UCNV_TO_U_CALLBACK_STOP ( const void *context, UConverterToUnicodeArgs *toUArgs, const char* codeUnits, int32_t length, UConverterCallbackReason reason, UErrorCode * err); /** * DO NOT CALL THIS FUNCTION DIRECTLY! * This From Unicode callback skips any ILLEGAL_SEQUENCE, or * skips only UNASSIGNED_SEQUENCE depending on the context parameter * simply ignoring those characters. 
* * @param context The function currently recognizes the callback options: * UCNV_SKIP_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE, * returning the error code back to the caller immediately. * NULL: Skips any ILLEGAL_SEQUENCE * @param fromUArgs Information about the conversion in progress * @param codeUnits Points to 'length' UChars of the concerned Unicode sequence * @param length Size (in UChars) of the concerned Unicode sequence * @param codePoint Single UChar32 (UTF-32) containing the concerned Unicode codepoint. * @param reason Defines the reason the callback was invoked * @param err Return value will be set to success if the callback was handled, * otherwise this value will be set to a failure status. * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 UCNV_FROM_U_CALLBACK_SKIP ( const void *context, UConverterFromUnicodeArgs *fromUArgs, const UChar* codeUnits, int32_t length, UChar32 codePoint, UConverterCallbackReason reason, UErrorCode * err); /** * DO NOT CALL THIS FUNCTION DIRECTLY! * This From Unicode callback will Substitute the ILLEGAL SEQUENCE, or * UNASSIGNED_SEQUENCE depending on context parameter, with the * current substitution string for the converter. This is the default * callback. * * @param context The function currently recognizes the callback options: * UCNV_SUB_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE, * returning the error code back to the caller immediately. * NULL: Substitutes any ILLEGAL_SEQUENCE * @param fromUArgs Information about the conversion in progress * @param codeUnits Points to 'length' UChars of the concerned Unicode sequence * @param length Size (in UChars) of the concerned Unicode sequence * @param codePoint Single UChar32 (UTF-32) containing the concerned Unicode codepoint. * @param reason Defines the reason the callback was invoked * @param err Return value will be set to success if the callback was handled, * otherwise this value will be set to a failure status. 
* @see ucnv_setSubstChars * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 UCNV_FROM_U_CALLBACK_SUBSTITUTE ( const void *context, UConverterFromUnicodeArgs *fromUArgs, const UChar* codeUnits, int32_t length, UChar32 codePoint, UConverterCallbackReason reason, UErrorCode * err); /** * DO NOT CALL THIS FUNCTION DIRECTLY! * This From Unicode callback will Substitute the ILLEGAL SEQUENCE with an * escaped representation of the illegal codepoints (hexadecimal, or decimal * for UCNV_ESCAPE_XML_DEC) * * @param context The function currently recognizes the callback options: *
    *
  • UCNV_ESCAPE_ICU: Substitutes the ILLEGAL SEQUENCE with the hexadecimal * representation in the format %UXXXX, e.g. "%UFFFE%U00AC%UC8FE". * In the event the converter doesn't support the characters {%,U}[A-F][0-9], * it will substitute the illegal sequence with the substitution characters. * Note that codeUnit(32bit int eg: unit of a surrogate pair) is represented as * %UD84D%UDC56
  • *
  • UCNV_ESCAPE_JAVA: Substitutes the ILLEGAL SEQUENCE with the hexadecimal * representation in the format \\uXXXX, e.g. "\\uFFFE\\u00AC\\uC8FE"). * In the Event the converter doesn't support the characters {\,u}[A-F][0-9], * it will substitute the illegal sequence with the substitution characters. * Note that codeUnit(32bit int eg: unit of a surrogate pair) is represented as * \\uD84D\\uDC56
  • *
  • UCNV_ESCAPE_C: Substitutes the ILLEGAL SEQUENCE with the hexadecimal * representation in the format \\uXXXX, e.g. "\\uFFFE\\u00AC\\uC8FE"). * In the Event the converter doesn't support the characters {\,u,U}[A-F][0-9], * it will substitute the illegal sequence with the substitution characters. * Note that codeUnit(32bit int eg: unit of a surrogate pair) is represented as * \\U00023456
  • *
  • UCNV_ESCAPE_XML_DEC: Substitutes the ILLEGAL SEQUENCE with the decimal * representation in the format \htmlonly&#DDDDDDDD;, e.g. "&#65534;&#172;&#51454;")\endhtmlonly. * In the Event the converter doesn't support the characters {&,#}[0-9], * it will substitute the illegal sequence with the substitution characters. * Note that codeUnit(32bit int eg: unit of a surrogate pair) is represented as * &#144470; and Zero padding is ignored.
  • *
  • UCNV_ESCAPE_XML_HEX: Substitutes the ILLEGAL SEQUENCE with the hexadecimal * representation in the format \htmlonly&#xXXXX; e.g. "&#xFFFE;&#x00AC;&#xC8FE;"\endhtmlonly. * In the event the converter doesn't support the characters {&,#,x}[A-F][0-9], * it will substitute the illegal sequence with the substitution characters. * Note that codeUnit(32bit int eg: unit of a surrogate pair) is represented as * \htmlonly&#x23456;\endhtmlonly
  • *
* @param fromUArgs Information about the conversion in progress * @param codeUnits Points to 'length' UChars of the concerned Unicode sequence * @param length Size (in UChars) of the concerned Unicode sequence * @param codePoint Single UChar32 (UTF-32) containing the concerned Unicode codepoint. * @param reason Defines the reason the callback was invoked * @param err Return value will be set to success if the callback was handled, * otherwise this value will be set to a failure status. * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 UCNV_FROM_U_CALLBACK_ESCAPE ( const void *context, UConverterFromUnicodeArgs *fromUArgs, const UChar* codeUnits, int32_t length, UChar32 codePoint, UConverterCallbackReason reason, UErrorCode * err); /** * DO NOT CALL THIS FUNCTION DIRECTLY! * This To Unicode callback skips any ILLEGAL_SEQUENCE, or * skips only UNASSIGNED_SEQUENCE depending on the context parameter * simply ignoring those characters. * * @param context The function currently recognizes the callback options: * UCNV_SKIP_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE, * returning the error code back to the caller immediately. * NULL: Skips any ILLEGAL_SEQUENCE * @param toUArgs Information about the conversion in progress * @param codeUnits Points to 'length' bytes of the concerned codepage sequence * @param length Size (in bytes) of the concerned codepage sequence * @param reason Defines the reason the callback was invoked * @param err Return value will be set to success if the callback was handled, * otherwise this value will be set to a failure status. * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 UCNV_TO_U_CALLBACK_SKIP ( const void *context, UConverterToUnicodeArgs *toUArgs, const char* codeUnits, int32_t length, UConverterCallbackReason reason, UErrorCode * err); /** * DO NOT CALL THIS FUNCTION DIRECTLY! * This To Unicode callback will Substitute the ILLEGAL SEQUENCE, or * UNASSIGNED_SEQUENCE depending on context parameter, with the * Unicode substitution character, U+FFFD. 
* * @param context The function currently recognizes the callback options: * UCNV_SUB_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE, * returning the error code back to the caller immediately. * NULL: Substitutes any ILLEGAL_SEQUENCE * @param toUArgs Information about the conversion in progress * @param codeUnits Points to 'length' bytes of the concerned codepage sequence * @param length Size (in bytes) of the concerned codepage sequence * @param reason Defines the reason the callback was invoked * @param err Return value will be set to success if the callback was handled, * otherwise this value will be set to a failure status. * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 UCNV_TO_U_CALLBACK_SUBSTITUTE ( const void *context, UConverterToUnicodeArgs *toUArgs, const char* codeUnits, int32_t length, UConverterCallbackReason reason, UErrorCode * err); /** * DO NOT CALL THIS FUNCTION DIRECTLY! * This To Unicode callback will Substitute the ILLEGAL SEQUENCE with the * hexadecimal representation of the illegal bytes * (in the format %XNN, e.g. "%XFF%X0A%XC8%X03"). * * @param context This function currently recognizes the callback options: * UCNV_ESCAPE_ICU, UCNV_ESCAPE_JAVA, UCNV_ESCAPE_C, UCNV_ESCAPE_XML_DEC, * UCNV_ESCAPE_XML_HEX and UCNV_ESCAPE_UNICODE. * @param toUArgs Information about the conversion in progress * @param codeUnits Points to 'length' bytes of the concerned codepage sequence * @param length Size (in bytes) of the concerned codepage sequence * @param reason Defines the reason the callback was invoked * @param err Return value will be set to success if the callback was handled, * otherwise this value will be set to a failure status. * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 UCNV_TO_U_CALLBACK_ESCAPE ( const void *context, UConverterToUnicodeArgs *toUArgs, const char* codeUnits, int32_t length, UConverterCallbackReason reason, UErrorCode * err); #endif /* !UCONFIG_NO_CONVERSION */ #endif /*UCNV_ERR_H*/ // ucnv.h // Copyright (C) 2016 and later: Unicode, Inc. and others. 
// License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (C) 1999-2014, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * ucnv.h: * External APIs for the ICU's codeset conversion library * Bertrand A. Damiba * * Modification History: * * Date Name Description * 04/04/99 helena Fixed internal header inclusion. * 05/11/00 helena Added setFallback and usesFallback APIs. * 06/29/2000 helena Major rewrite of the callback APIs. * 12/07/2000 srl Update of documentation */ /** * \file * \brief C API: Character conversion * *

Character Conversion C API

* *

This API is used to convert codepage or character encoded data to and * from UTF-16. You can open a converter with {@link ucnv_open() }. With that * converter, you can get its properties, set options, convert your data and * close the converter.

* *

Since many software programs recognize different converter names for * different types of converters, there are other functions in this API to * iterate over the converter aliases. The functions {@link ucnv_getAvailableName() }, * {@link ucnv_getAlias() } and {@link ucnv_getStandardName() } are some of the * more frequently used alias functions to get this information.

* *

When a converter encounters an illegal, irregular, invalid or unmappable character * its default behavior is to use a substitution character to replace the * bad byte sequence. This behavior can be changed by using {@link ucnv_setFromUCallBack() } * or {@link ucnv_setToUCallBack() } on the converter. The header ucnv_err.h defines * many other callback actions that can be used instead of a character substitution.

* *

More information about this API can be found in our * User Guide.

*/ #ifndef UCNV_H #define UCNV_H #if !defined(USET_DEFINED) && !defined(U_IN_DOXYGEN) #define USET_DEFINED /** * USet is the C API type corresponding to C++ class UnicodeSet. * It is forward-declared here to avoid including unicode/uset.h file if related * conversion APIs are not used. * * @see ucnv_getUnicodeSet * @stable ICU 2.4 */ typedef struct USet USet; #endif #if !UCONFIG_NO_CONVERSION U_CDECL_BEGIN /** Maximum length of a converter name including the terminating NUL @stable ICU 2.0 */ #define UCNV_MAX_CONVERTER_NAME_LENGTH 60 /** Maximum length of a converter name including path and terminating NUL @stable ICU 2.0 */ #define UCNV_MAX_FULL_FILE_NAME_LENGTH (600+UCNV_MAX_CONVERTER_NAME_LENGTH) /** Shift in for EBCDIC_STATEFUL and iso2022 states @stable ICU 2.0 */ #define UCNV_SI 0x0F /** Shift out for EBCDIC_STATEFUL and iso2022 states @stable ICU 2.0 */ #define UCNV_SO 0x0E /** * Enum for specifying basic types of converters * @see ucnv_getType * @stable ICU 2.0 */ typedef enum { /** @stable ICU 2.0 */ UCNV_UNSUPPORTED_CONVERTER = -1, /** @stable ICU 2.0 */ UCNV_SBCS = 0, /** @stable ICU 2.0 */ UCNV_DBCS = 1, /** @stable ICU 2.0 */ UCNV_MBCS = 2, /** @stable ICU 2.0 */ UCNV_LATIN_1 = 3, /** @stable ICU 2.0 */ UCNV_UTF8 = 4, /** @stable ICU 2.0 */ UCNV_UTF16_BigEndian = 5, /** @stable ICU 2.0 */ UCNV_UTF16_LittleEndian = 6, /** @stable ICU 2.0 */ UCNV_UTF32_BigEndian = 7, /** @stable ICU 2.0 */ UCNV_UTF32_LittleEndian = 8, /** @stable ICU 2.0 */ UCNV_EBCDIC_STATEFUL = 9, /** @stable ICU 2.0 */ UCNV_ISO_2022 = 10, /** @stable ICU 2.0 */ UCNV_LMBCS_1 = 11, /** @stable ICU 2.0 */ UCNV_LMBCS_2, /** @stable ICU 2.0 */ UCNV_LMBCS_3, /** @stable ICU 2.0 */ UCNV_LMBCS_4, /** @stable ICU 2.0 */ UCNV_LMBCS_5, /** @stable ICU 2.0 */ UCNV_LMBCS_6, /** @stable ICU 2.0 */ UCNV_LMBCS_8, /** @stable ICU 2.0 */ UCNV_LMBCS_11, /** @stable ICU 2.0 */ UCNV_LMBCS_16, /** @stable ICU 2.0 */ UCNV_LMBCS_17, /** @stable ICU 2.0 */ UCNV_LMBCS_18, /** @stable ICU 2.0 */ 
UCNV_LMBCS_19, /** @stable ICU 2.0 */ UCNV_LMBCS_LAST = UCNV_LMBCS_19, /** @stable ICU 2.0 */ UCNV_HZ, /** @stable ICU 2.0 */ UCNV_SCSU, /** @stable ICU 2.0 */ UCNV_ISCII, /** @stable ICU 2.0 */ UCNV_US_ASCII, /** @stable ICU 2.0 */ UCNV_UTF7, /** @stable ICU 2.2 */ UCNV_BOCU1, /** @stable ICU 2.2 */ UCNV_UTF16, /** @stable ICU 2.2 */ UCNV_UTF32, /** @stable ICU 2.2 */ UCNV_CESU8, /** @stable ICU 2.4 */ UCNV_IMAP_MAILBOX, /** @stable ICU 4.8 */ UCNV_COMPOUND_TEXT, /* Number of converter types for which we have conversion routines. */ UCNV_NUMBER_OF_SUPPORTED_CONVERTER_TYPES } UConverterType; /** * Enum for specifying which platform a converter ID refers to. * The use of platform/CCSID is not recommended. See ucnv_openCCSID(). * * @see ucnv_getPlatform * @see ucnv_openCCSID * @see ucnv_getCCSID * @stable ICU 2.0 */ typedef enum { UCNV_UNKNOWN = -1, UCNV_IBM = 0 /**< IBM; currently the only platform supported by the converter API (see ucnv_openCCSID()). */ } UConverterPlatform; /** * Function pointer for error callback in the codepage to Unicode direction. * Called when an error has occurred in conversion to Unicode, or on open/close of the callback (see reason). * @param context Pointer to the callback's private data * @param args Information about the conversion in progress * @param codeUnits Points to 'length' bytes of the concerned codepage sequence * @param length Size (in bytes) of the concerned codepage sequence * @param reason Defines the reason the callback was invoked * @param pErrorCode ICU error code in/out parameter. * For converter callback functions, set to a conversion error * before the call, and the callback may reset it to U_ZERO_ERROR. * @see ucnv_setToUCallBack * @see UConverterToUnicodeArgs * @stable ICU 2.0 */ typedef void (U_EXPORT2 *UConverterToUCallback) ( const void* context, UConverterToUnicodeArgs *args, const char *codeUnits, int32_t length, UConverterCallbackReason reason, UErrorCode *pErrorCode); /** * Function pointer for error callback in the Unicode to codepage direction. 
* Called when an error has occurred in conversion from Unicode, or on open/close of the callback (see reason). * @param context Pointer to the callback's private data * @param args Information about the conversion in progress * @param codeUnits Points to 'length' UChars of the concerned Unicode sequence * @param length Size (in UChars) of the concerned Unicode sequence * @param codePoint Single UChar32 (UTF-32) containing the concerned Unicode codepoint. * @param reason Defines the reason the callback was invoked * @param pErrorCode ICU error code in/out parameter. * For converter callback functions, set to a conversion error * before the call, and the callback may reset it to U_ZERO_ERROR. * @see ucnv_setFromUCallBack * @see UConverterFromUnicodeArgs * @stable ICU 2.0 */ typedef void (U_EXPORT2 *UConverterFromUCallback) ( const void* context, UConverterFromUnicodeArgs *args, const UChar* codeUnits, int32_t length, UChar32 codePoint, UConverterCallbackReason reason, UErrorCode *pErrorCode); U_CDECL_END /** * Character that separates converter names from options and options from each other. * @see ucnv_open * @stable ICU 2.0 */ #define UCNV_OPTION_SEP_CHAR ',' /** * String version of UCNV_OPTION_SEP_CHAR. * @see ucnv_open * @stable ICU 2.0 */ #define UCNV_OPTION_SEP_STRING "," /** * Character that separates a converter option from its value. * @see ucnv_open * @stable ICU 2.0 */ #define UCNV_VALUE_SEP_CHAR '=' /** * String version of UCNV_VALUE_SEP_CHAR. * @see ucnv_open * @stable ICU 2.0 */ #define UCNV_VALUE_SEP_STRING "=" /** * Converter option for specifying a locale. * For example, ucnv_open("SCSU,locale=ja", &errorCode); * See convrtrs.txt. * * @see ucnv_open * @stable ICU 2.0 */ #define UCNV_LOCALE_OPTION_STRING ",locale=" /** * Converter option for specifying a version selector (0..9) for some converters. * For example, * \code * ucnv_open("UTF-7,version=1", &errorCode); * \endcode * See convrtrs.txt. 
* * @see ucnv_open * @stable ICU 2.4 */ #define UCNV_VERSION_OPTION_STRING ",version=" /** * Converter option for EBCDIC SBCS or mixed-SBCS/DBCS (stateful) codepages. * Swaps Unicode mappings for EBCDIC LF and NL codes, as used on * S/390 (z/OS) Unix System Services (Open Edition). * For example, ucnv_open("ibm-1047,swaplfnl", &errorCode); * See convrtrs.txt. * * @see ucnv_open * @stable ICU 2.4 */ #define UCNV_SWAP_LFNL_OPTION_STRING ",swaplfnl" /** * Do a fuzzy compare of two converter/alias names. * The comparison is case-insensitive, ignores leading zeroes if they are not * followed by further digits, and ignores all but letters and digits. * Thus the strings "UTF-8", "utf_8", "u*T@f08" and "Utf 8" are exactly equivalent. * See section 1.4, Charset Alias Matching in Unicode Technical Standard #22 * at http://www.unicode.org/reports/tr22/ * * @param name1 a converter name or alias, zero-terminated * @param name2 a converter name or alias, zero-terminated * @return 0 if the names match, or a negative value if name1 * lexically precedes name2, or a positive value if name1 * lexically follows name2. * @stable ICU 2.0 */ U_CAPI int U_EXPORT2 ucnv_compareNames(const char *name1, const char *name2); /** * Creates a UConverter object with the name of a coded character set specified as a C string. * The actual name will be resolved with the alias file * using a case-insensitive string comparison that ignores * leading zeroes and all non-alphanumeric characters. * E.g., the names "UTF8", "utf-8", "u*T@f08" and "Utf 8" are all equivalent. * (See also ucnv_compareNames().) * If NULL is passed for the converter name, it will create one with the * ucnv_getDefaultName() return value. * *

A converter name for ICU 1.5 and above may contain options * like a locale specification to control the specific behavior of * the newly instantiated converter. * The meaning of the options depends on the particular converter. * If an option is not defined for or recognized by a given converter, then it is ignored.

* *

Options are appended to the converter name string, with a * UCNV_OPTION_SEP_CHAR between the name and the first option and * also between adjacent options.

* *

If the alias is ambiguous, then the preferred converter is used * and the status is set to U_AMBIGUOUS_ALIAS_WARNING.

* *

The conversion behavior and names can vary between platforms. ICU may * convert some characters differently from other platforms. Details on this topic * are in the User * Guide. Aliases starting with a "cp" prefix have no specific meaning * other than that they are aliases starting with the letters "cp". Please do not * associate any meaning with these aliases.

* * \snippet samples/ucnv/convsamp.cpp ucnv_open * * @param converterName Name of the coded character set table. * This may have options appended to the string. * IANA alias character set names, IBM CCSIDs starting with "ibm-", * Windows codepage numbers starting with "windows-" are frequently * used for this parameter. See ucnv_getAvailableName and * ucnv_getAlias for a complete list that is available. * If this parameter is NULL, the default converter will be used. * @param err outgoing error status U_MEMORY_ALLOCATION_ERROR, U_FILE_ACCESS_ERROR * @return the created Unicode converter object, or NULL if an error occurred * @see ucnv_openU * @see ucnv_openCCSID * @see ucnv_getAvailableName * @see ucnv_getAlias * @see ucnv_getDefaultName * @see ucnv_close * @see ucnv_compareNames * @stable ICU 2.0 */ U_CAPI UConverter* U_EXPORT2 ucnv_open(const char *converterName, UErrorCode *err); /** * Creates a Unicode converter with the name specified as a Unicode string. * The name should be limited to the ASCII-7 alphanumerics range. * The actual name will be resolved with the alias file * using a case-insensitive string comparison that ignores * leading zeroes and all non-alphanumeric characters. * E.g., the names "UTF8", "utf-8", "u*T@f08" and "Utf 8" are all equivalent. * (See also ucnv_compareNames().) * If NULL is passed for the converter name, it will create * one with the ucnv_getDefaultName() return value. * If the alias is ambiguous, then the preferred converter is used * and the status is set to U_AMBIGUOUS_ALIAS_WARNING. * *

See ucnv_open for the complete details

* @param name Name of the UConverter table in a zero-terminated * Unicode string. If NULL, the ucnv_getDefaultName() return value is used. * @param err outgoing error status U_MEMORY_ALLOCATION_ERROR, * U_FILE_ACCESS_ERROR * @return the created Unicode converter object, or NULL if an * error occurred * @see ucnv_open * @see ucnv_openCCSID * @see ucnv_close * @see ucnv_compareNames * @stable ICU 2.0 */ U_CAPI UConverter* U_EXPORT2 ucnv_openU(const UChar *name, UErrorCode *err); /** * Creates a UConverter object from a CCSID number and platform pair. * Note that the usefulness of this function is limited to platforms with numeric * encoding IDs. Only IBM and Microsoft platforms use numeric (16-bit) identifiers for * encodings. * * In addition, IBM CCSIDs and Unicode conversion tables are not 1:1 related. * For many IBM CCSIDs there are multiple (up to six) Unicode conversion tables, and * for some Unicode conversion tables there are multiple CCSIDs. * Some "alternate" Unicode conversion tables are provided by the * IBM CDRA conversion table registry. * The most prominent example of a systematic modification of conversion tables that is * not provided in the form of conversion table files in the repository is * that S/390 Unix System Services swaps the codes for Line Feed and New Line in all * EBCDIC codepages, which requires such a swap in the Unicode conversion tables as well. * * Only IBM default conversion tables are accessible with ucnv_openCCSID(). * ucnv_getCCSID() will return the same CCSID for all conversion tables that are associated * with that CCSID. * * Currently, the only "platform" supported in the ICU converter API is UCNV_IBM. * * In summary, the use of CCSIDs and the associated API functions is not recommended. 
* * In order to open a converter with the default IBM CDRA Unicode conversion table, * you can use this function or use the prefix "ibm-": * \code * char name[20]; * sprintf(name, "ibm-%hu", ccsid); * cnv=ucnv_open(name, &errorCode); * \endcode * * In order to open a converter with the IBM S/390 Unix System Services variant * of a Unicode/EBCDIC conversion table, * you can use the prefix "ibm-" together with the option string UCNV_SWAP_LFNL_OPTION_STRING: * \code * char name[20]; * sprintf(name, "ibm-%hu" UCNV_SWAP_LFNL_OPTION_STRING, ccsid); * cnv=ucnv_open(name, &errorCode); * \endcode * * In order to open a converter from a Microsoft codepage number, use the prefix "cp": * \code * char name[20]; * sprintf(name, "cp%hu", codepageID); * cnv=ucnv_open(name, &errorCode); * \endcode * * If the alias is ambiguous, then the preferred converter is used * and the status is set to U_AMBIGUOUS_ALIAS_WARNING. * * @param codepage codepage (CCSID) number of the converter to create * @param platform the platform in which the codepage number exists; * currently the only supported platform is UCNV_IBM * @param err outgoing error status U_MEMORY_ALLOCATION_ERROR, U_FILE_ACCESS_ERROR * @return the created Unicode converter object, or NULL if an error * occurred. * @see ucnv_open * @see ucnv_openU * @see ucnv_close * @see ucnv_getCCSID * @see ucnv_getPlatform * @see UConverterPlatform * @stable ICU 2.0 */ U_CAPI UConverter* U_EXPORT2 ucnv_openCCSID(int32_t codepage, UConverterPlatform platform, UErrorCode * err); /** *

Creates a UConverter object specified from a packageName and a converterName.

* *

The packageName and converterName must point to an ICU udata object, as defined by * udata_open( packageName, "cnv", converterName, err) or equivalent. * Typically, packageName will refer to a (.dat) file, or to a package registered with * udata_setAppData(). Using a full file or directory pathname for packageName is deprecated.

* *

The name will NOT be looked up in the alias mechanism, nor will the converter be * stored in the converter cache or the alias table. The only way to open further converters * is call this function multiple times, or use the ucnv_safeClone() function to clone a * 'primary' converter.

* *

A future version of ICU may add alias table lookups and/or caching * to this function.

* *

Example Use: * cnv = ucnv_openPackage("myapp", "myconverter", &err); *

* * @param packageName name of the package (equivalent to 'path' in udata_open() call) * @param converterName name of the data item to be used, without suffix. * @param err outgoing error status U_MEMORY_ALLOCATION_ERROR, U_FILE_ACCESS_ERROR * @return the created Unicode converter object, or NULL if an error occurred * @see udata_open * @see ucnv_open * @see ucnv_safeClone * @see ucnv_close * @stable ICU 2.2 */ U_CAPI UConverter* U_EXPORT2 ucnv_openPackage(const char *packageName, const char *converterName, UErrorCode *err); /** * Thread safe converter cloning operation. * For most efficient operation, pass in a stackBuffer (and a *pBufferSize) * with at least U_CNV_SAFECLONE_BUFFERSIZE bytes of space. * If the buffer size is sufficient, then the clone will use the stack buffer; * otherwise, it will be allocated, and *pBufferSize will indicate * the actual size. (This should not occur with U_CNV_SAFECLONE_BUFFERSIZE.) * * You must ucnv_close() the clone in any case. * * If *pBufferSize==0, (regardless of whether stackBuffer==NULL or not) * then *pBufferSize will be changed to a sufficient size * for cloning this converter, * without actually cloning the converter ("pure pre-flighting"). * * If *pBufferSize is greater than zero but not large enough for a stack-based * clone, then the converter is cloned using newly allocated memory * and *pBufferSize is changed to the necessary size. * * If the converter clone fits into the stack buffer but the stack buffer is not * sufficiently aligned for the clone, then the clone will use an * adjusted pointer and use an accordingly smaller buffer size. * * @param cnv converter to be cloned * @param stackBuffer Deprecated functionality as of ICU 52, use NULL.
* user allocated space for the new clone. If NULL new memory will be allocated. * If buffer is not large enough, new memory will be allocated. * Clients can use the U_CNV_SAFECLONE_BUFFERSIZE. This will probably be enough to avoid memory allocations. * @param pBufferSize Deprecated functionality as of ICU 52, use NULL or 1.
* pointer to size of allocated space. * @param status to indicate whether the operation went on smoothly or there were errors * An informational status value, U_SAFECLONE_ALLOCATED_WARNING, * is used if any allocations were necessary. * However, it is better to check if *pBufferSize grew for checking for * allocations because warning codes can be overridden by subsequent * function calls. * @return pointer to the new clone * @stable ICU 2.0 */ U_CAPI UConverter * U_EXPORT2 ucnv_safeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status); /** * Thread safe converter cloning operation. * Unlike ucnv_safeClone(), this function takes no caller-supplied buffer. * * You must ucnv_close() the clone. * * This declaration is only available when NTDDI_VERSION >= NTDDI_WIN11_ZN. * * @param cnv converter to be cloned * @param status to indicate whether the operation went on smoothly or there were errors * @return pointer to the new clone * @see ucnv_safeClone * @see ucnv_close * @stable ICU 71 */ #if (NTDDI_VERSION >= NTDDI_WIN11_ZN) U_CAPI UConverter* U_EXPORT2 ucnv_clone(const UConverter *cnv, UErrorCode *status); #endif // (NTDDI_VERSION >= NTDDI_WIN11_ZN) /** * Deletes the unicode converter and releases resources associated * with just this instance. * Does not free up shared converter tables. * * @param converter the converter object to be deleted * @see ucnv_open * @see ucnv_openU * @see ucnv_openCCSID * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ucnv_close(UConverter * converter); /** * Fills in the output parameter, subChars, with the substitution characters * as multiple bytes. * If ucnv_setSubstString() set a Unicode string because the converter is * stateful, then subChars will be an empty string. * * @param converter the Unicode converter * @param subChars the substitution characters * @param len on input the capacity of subChars, on output the number * of bytes copied to it * @param err the outgoing error status code. * If the substitution character array is too small, an * U_INDEX_OUTOFBOUNDS_ERROR will be returned. * @see ucnv_setSubstString * @see ucnv_setSubstChars * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ucnv_getSubstChars(const UConverter *converter, char *subChars, int8_t *len, UErrorCode *err); /** * Sets the substitution chars when converting from unicode to a codepage. The * substitution is specified as a string of 1-4 bytes, and may contain * NULL bytes. 
* The subChars must represent a single character. The caller needs to know the * byte sequence of a valid character in the converter's charset. * For some converters, for example some ISO 2022 variants, only single-byte * substitution characters may be supported. * The newer ucnv_setSubstString() function relaxes these limitations. * * @param converter the Unicode converter * @param subChars the substitution character byte sequence we want set * @param len the number of bytes in subChars * @param err the error status code. U_INDEX_OUTOFBOUNDS_ERROR if * len is bigger than the maximum number of bytes allowed in subchars * @see ucnv_setSubstString * @see ucnv_getSubstChars * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ucnv_setSubstChars(UConverter *converter, const char *subChars, int8_t len, UErrorCode *err); /** * Set a substitution string for converting from Unicode to a charset. * The caller need not know the charset byte sequence for each charset. * * Unlike ucnv_setSubstChars() which is designed to set a charset byte sequence * for a single character, this function takes a Unicode string with * zero, one or more characters, and immediately verifies that the string can be * converted to the charset. * If not, or if the result is too long (more than 32 bytes as of ICU 3.6), * then the function returns with an error accordingly. * * Also unlike ucnv_setSubstChars(), this function works for stateful charsets * by converting on the fly at the point of substitution rather than setting * a fixed byte sequence. * * @param cnv The UConverter object. * @param s The Unicode string. * @param length The number of UChars in s, or -1 for a NUL-terminated string. * @param err Pointer to a standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) 
* * @see ucnv_setSubstChars * @see ucnv_getSubstChars * @stable ICU 3.6 */ U_CAPI void U_EXPORT2 ucnv_setSubstString(UConverter *cnv, const UChar *s, int32_t length, UErrorCode *err); /** * Fills in the output parameter, errBytes, with the error characters from the * last failing conversion. * * @param converter the Unicode converter * @param errBytes the codepage bytes which were in error * @param len on input the capacity of errBytes, on output the number of * bytes which were copied to it * @param err the error status code. * If the errBytes array is too small, a * U_INDEX_OUTOFBOUNDS_ERROR will be returned. * @see ucnv_getInvalidUChars * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ucnv_getInvalidChars(const UConverter *converter, char *errBytes, int8_t *len, UErrorCode *err); /** * Fills in the output parameter, errUChars, with the error characters from the * last failing conversion. * * @param converter the Unicode converter * @param errUChars the UChars which were in error * @param len on input the capacity of errUChars, on output the number of * UChars which were copied to it * @param err the error status code. * If the errUChars array is too small, a * U_INDEX_OUTOFBOUNDS_ERROR will be returned. * @see ucnv_getInvalidChars * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ucnv_getInvalidUChars(const UConverter *converter, UChar *errUChars, int8_t *len, UErrorCode *err); /** * Resets the state of a converter to the default state. This is used * in the case of an error, to restart a conversion from a known default state. * It will also empty the internal output buffers. * @param converter the Unicode converter * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ucnv_reset(UConverter *converter); /** * Resets the to-Unicode part of a converter state to the default state. * This is used in the case of an error to restart a conversion to * Unicode to a known default state. It will also empty the internal * output buffers used for the conversion to Unicode codepoints. 
* @param converter the Unicode converter * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ucnv_resetToUnicode(UConverter *converter); /** * Resets the from-Unicode part of a converter state to the default state. * This is used in the case of an error to restart a conversion from * Unicode to a known default state. It will also empty the internal output * buffers used for the conversion from Unicode codepoints. * @param converter the Unicode converter * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ucnv_resetFromUnicode(UConverter *converter); /** * Returns the maximum number of bytes that are output per UChar in conversion * from Unicode using this converter. * The returned number can be used with UCNV_GET_MAX_BYTES_FOR_STRING * to calculate the size of a target buffer for conversion from Unicode. * * Note: Before ICU 2.8, this function did not return reliable numbers for * some stateful converters (EBCDIC_STATEFUL, ISO-2022) and LMBCS. * * This number may not be the same as the maximum number of bytes per * "conversion unit". In other words, it may not be the intuitively expected * number of bytes per character that would be published for a charset, * and may not fulfill any other purpose than the allocation of an output * buffer of guaranteed sufficient size for a given input length and converter. * * Examples for special cases that are taken into account: * - Supplementary code points may convert to more bytes than BMP code points. * This function returns bytes per UChar (UTF-16 code unit), not per * Unicode code point, for efficient buffer allocation. * - State-shifting output (SI/SO, escapes, etc.) from stateful converters. * - When m input UChars are converted to n output bytes, then the maximum m/n * is taken into account. 
* * The number returned here does not take into account * (see UCNV_GET_MAX_BYTES_FOR_STRING): * - callbacks which output more than one charset character sequence per call, * like escape callbacks * - initial and final non-character bytes that are output by some converters * (automatic BOMs, initial escape sequence, final SI, etc.) * * Examples for returned values: * - SBCS charsets: 1 * - Shift-JIS: 2 * - UTF-16: 2 (2 per BMP, 4 per surrogate _pair_, BOM not counted) * - UTF-8: 3 (3 per BMP, 4 per surrogate _pair_) * - EBCDIC_STATEFUL (EBCDIC mixed SBCS/DBCS): 3 (SO + DBCS) * - ISO-2022: 3 (always outputs UTF-8) * - ISO-2022-JP: 6 (4-byte escape sequences + DBCS) * - ISO-2022-CN: 8 (4-byte designator sequences + 2-byte SS2/SS3 + DBCS) * * @param converter The Unicode converter. * @return The maximum number of bytes per UChar (16 bit code unit) * that are output by ucnv_fromUnicode(), * to be used together with UCNV_GET_MAX_BYTES_FOR_STRING * for buffer allocation. * * @see UCNV_GET_MAX_BYTES_FOR_STRING * @see ucnv_getMinCharSize * @stable ICU 2.0 */ U_CAPI int8_t U_EXPORT2 ucnv_getMaxCharSize(const UConverter *converter); /** * Calculates the size of a buffer for conversion from Unicode to a charset. * The calculated size is guaranteed to be sufficient for this conversion. * * It takes into account initial and final non-character bytes that are output * by some converters. * It does not take into account callbacks which output more than one charset * character sequence per call, like escape callbacks. * The default (substitution) callback only outputs one charset character sequence. * * @param length Number of UChars to be converted. * @param maxCharSize Return value from ucnv_getMaxCharSize() for the converter * that will be used. * @return Size of a buffer that will be large enough to hold the output bytes of * converting length UChars with the converter that returned the maxCharSize. 
* * @see ucnv_getMaxCharSize * @stable ICU 2.8 */ #define UCNV_GET_MAX_BYTES_FOR_STRING(length, maxCharSize) \ (((int32_t)(length)+10)*(int32_t)(maxCharSize)) /** * Returns the minimum byte length (per codepoint) for characters in this codepage. * This is usually either 1 or 2. * @param converter the Unicode converter * @return the minimum number of bytes per codepoint allowed by this particular converter * @see ucnv_getMaxCharSize * @stable ICU 2.0 */ U_CAPI int8_t U_EXPORT2 ucnv_getMinCharSize(const UConverter *converter); /** * Returns the display name of the converter passed in, based on the locale * passed in. If the locale contains no display name, the internal ASCII * name will be filled in. * * @param converter the Unicode converter. * @param displayLocale the specific locale we want to localize for * @param displayName user-provided buffer to be filled in * @param displayNameCapacity size of the displayName buffer * @param err error status code * @return the number of UChars needed in displayName * @see ucnv_getName * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 ucnv_getDisplayName(const UConverter *converter, const char *displayLocale, UChar *displayName, int32_t displayNameCapacity, UErrorCode *err); /** * Gets the internal, canonical name of the converter (zero-terminated). * The lifetime of the returned string will be that of the converter * passed to this function. * @param converter the Unicode converter * @param err UErrorCode status * @return the internal name of the converter * @see ucnv_getDisplayName * @stable ICU 2.0 */ U_CAPI const char * U_EXPORT2 ucnv_getName(const UConverter *converter, UErrorCode *err); /** * Gets a codepage number associated with the converter. This is not guaranteed * to be the one used to create the converter. Some converters do not represent * platform registered codepages and return zero for the codepage number. * The error code fill-in parameter indicates if the codepage number * is available. 
* Does not check if the converter is NULL or if converter's data * table is NULL. * * Important: The use of CCSIDs is not recommended because it is limited * to only two platforms in principle and only one (UCNV_IBM) in the current * ICU converter API. * Also, CCSIDs are insufficient to identify IBM Unicode conversion tables precisely. * For more details see ucnv_openCCSID(). * * @param converter the Unicode converter * @param err the error status code. * @return If any error occurs, -1 will be returned otherwise, the codepage number * will be returned * @see ucnv_openCCSID * @see ucnv_getPlatform * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 ucnv_getCCSID(const UConverter *converter, UErrorCode *err); /** * Gets a codepage platform associated with the converter. Currently, * only UCNV_IBM will be returned. * Does not test if the converter is NULL or if converter's data * table is NULL. * @param converter the Unicode converter * @param err the error status code. * @return The codepage platform * @stable ICU 2.0 */ U_CAPI UConverterPlatform U_EXPORT2 ucnv_getPlatform(const UConverter *converter, UErrorCode *err); /** * Gets the type of the converter * e.g. SBCS, MBCS, DBCS, UTF8, UTF16_BE, UTF16_LE, ISO_2022, * EBCDIC_STATEFUL, LATIN_1 * @param converter a valid, opened converter * @return the type of the converter * @stable ICU 2.0 */ U_CAPI UConverterType U_EXPORT2 ucnv_getType(const UConverter * converter); /** * Gets the "starter" (lead) bytes for converters of type MBCS. * Will fill in an U_ILLEGAL_ARGUMENT_ERROR if converter passed in * is not MBCS. Fills in an array of type UBool, with the value of the byte * as offset to the array. For example, if (starters[0x20] == true) at return, * it means that the byte 0x20 is a starter byte in this converter. * Context pointers are always owned by the caller. 
* * @param converter a valid, opened converter of type MBCS * @param starters an array of size 256 to be filled in * @param err error status, U_ILLEGAL_ARGUMENT_ERROR if the * converter is not a type which can return starters. * @see ucnv_getType * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ucnv_getStarters(const UConverter* converter, UBool starters[256], UErrorCode* err); /** * Selectors for Unicode sets that can be returned by ucnv_getUnicodeSet(). * @see ucnv_getUnicodeSet * @stable ICU 2.6 */ typedef enum UConverterUnicodeSet { /** Select the set of roundtrippable Unicode code points. @stable ICU 2.6 */ UCNV_ROUNDTRIP_SET, /** Select the set of Unicode code points with roundtrip or fallback mappings. @stable ICU 4.0 */ UCNV_ROUNDTRIP_AND_FALLBACK_SET, } UConverterUnicodeSet; /** * Returns the set of Unicode code points that can be converted by an ICU converter. * * Returns one of several kinds of set: * * 1. UCNV_ROUNDTRIP_SET * * The set of all Unicode code points that can be roundtrip-converted * (converted without any data loss) with the converter (ucnv_fromUnicode()). * This set will not include code points that have fallback mappings * or are only the result of reverse fallback mappings. * This set will also not include PUA code points with fallbacks, although * ucnv_fromUnicode() will always use those mappings despite ucnv_setFallback(). * See UTR #22 "Character Mapping Markup Language" * at http://www.unicode.org/reports/tr22/ * * This is useful for example for * - checking that a string or document can be roundtrip-converted with a converter, * without/before actually performing the conversion * - testing if a converter can be used for typical text for a certain locale, * by comparing its roundtrip set with the set of ExemplarCharacters from * ICU's locale data or other sources * * 2. 
UCNV_ROUNDTRIP_AND_FALLBACK_SET * * The set of all Unicode code points that can be converted with the converter (ucnv_fromUnicode()) * when fallbacks are turned on (see ucnv_setFallback()). * This set includes all code points with roundtrips and fallbacks (but not reverse fallbacks). * * In the future, there may be more UConverterUnicodeSet choices to select * sets with different properties. * * @param cnv The converter for which a set is requested. * @param setFillIn A valid USet *. It will be cleared by this function before * the converter's specific set is filled into the USet. * @param whichSet A UConverterUnicodeSet selector choosing one of the set kinds * described above (UCNV_ROUNDTRIP_SET or UCNV_ROUNDTRIP_AND_FALLBACK_SET). * @param pErrorCode ICU error code in/out parameter. * Must fulfill U_SUCCESS before the function call. * * @see UConverterUnicodeSet * @see uset_open * @see uset_close * @stable ICU 2.6 */ U_CAPI void U_EXPORT2 ucnv_getUnicodeSet(const UConverter *cnv, USet *setFillIn, UConverterUnicodeSet whichSet, UErrorCode *pErrorCode); /** * Gets the current callback function used by the converter when an illegal * or invalid codepage sequence is found. * Context pointers are always owned by the caller. * * @param converter the unicode converter * @param action fillin: returns the callback function pointer * @param context fillin: returns the callback's private void* context * @see ucnv_setToUCallBack * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ucnv_getToUCallBack (const UConverter * converter, UConverterToUCallback *action, const void **context); /** * Gets the current callback function used by the converter when illegal * or invalid Unicode sequence is found. * Context pointers are always owned by the caller. 
* * @param converter the unicode converter * @param action fillin: returns the callback function pointer * @param context fillin: returns the callback's private void* context * @see ucnv_setFromUCallBack * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ucnv_getFromUCallBack (const UConverter * converter, UConverterFromUCallback *action, const void **context); /** * Changes the callback function used by the converter when * an illegal or invalid sequence is found. * Context pointers are always owned by the caller. * Predefined actions and contexts can be found in the ucnv_err.h header. * * @param converter the unicode converter * @param newAction the new callback function * @param newContext the new toUnicode callback context pointer. This can be NULL. * @param oldAction fillin: returns the old callback function pointer. This can be NULL. * @param oldContext fillin: returns the old callback's private void* context. This can be NULL. * @param err The error code status * @see ucnv_getToUCallBack * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ucnv_setToUCallBack (UConverter * converter, UConverterToUCallback newAction, const void* newContext, UConverterToUCallback *oldAction, const void** oldContext, UErrorCode * err); /** * Changes the current callback function used by the converter when * an illegal or invalid sequence is found. * Context pointers are always owned by the caller. * Predefined actions and contexts can be found in the ucnv_err.h header. * * @param converter the unicode converter * @param newAction the new callback function * @param newContext the new fromUnicode callback context pointer. This can be NULL. * @param oldAction fillin: returns the old callback function pointer. This can be NULL. * @param oldContext fillin: returns the old callback's private void* context. This can be NULL. 
* @param err The error code status * @see ucnv_getFromUCallBack * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ucnv_setFromUCallBack (UConverter * converter, UConverterFromUCallback newAction, const void *newContext, UConverterFromUCallback *oldAction, const void **oldContext, UErrorCode * err); /** * Converts an array of unicode characters to an array of codepage * characters. This function is optimized for converting a continuous * stream of data in buffer-sized chunks, where the entire source and * target does not fit in available buffers. * * The source pointer is an in/out parameter. It starts out pointing where the * conversion is to begin, and ends up pointing after the last UChar consumed. * * Target similarly starts out pointing at the first available byte in the output * buffer, and ends up pointing after the last byte written to the output. * * The converter always attempts to consume the entire source buffer, unless * (1.) the target buffer is full, or (2.) a failing error is returned from the * current callback function. When a successful error status has been * returned, it means that all of the source buffer has been * consumed. At that point, the caller should reset the source and * sourceLimit pointers to point to the next chunk. * * At the end of the stream (flush==true), the input is completely consumed * when *source==sourceLimit and no error code is set. * The converter object is then automatically reset by this function. * (This means that a converter need not be reset explicitly between data * streams if it finishes the previous stream without errors.) * * This is a stateful conversion. Additionally, even when all source data has * been consumed, some data may be in the converter's internal state. 
* Call this function repeatedly, updating the target pointers with * the next empty chunk of target in case of a * U_BUFFER_OVERFLOW_ERROR, and updating the source pointers * with the next chunk of source when a successful error status is * returned, until there are no more chunks of source data. * @param converter the Unicode converter * @param target I/O parameter. Input : Points to the beginning of the buffer to copy * codepage characters to. Output : points to after the last codepage character copied * to target. * @param targetLimit the pointer just after the end of the target buffer * @param source I/O parameter, pointer to pointer to the source Unicode character buffer. * @param sourceLimit the pointer just after the end of the source buffer * @param offsets if NULL is passed, nothing will happen to it, otherwise it needs to have the same number * of allocated cells as target. Will fill in offsets from target to source pointer * e.g., if offsets[3] is equal to 6, it means that target[3] was a result of transcoding source[6] * For output data carried across calls, and other data without a specific source character * (such as from escape sequences or callbacks) -1 will be placed for offsets. * @param flush set to true if the current source buffer is the last available * chunk of the source, false otherwise. Note that if a failing status is returned, * this function may have to be called multiple times with flush set to true until * the source buffer is consumed. * @param err the error status. U_ILLEGAL_ARGUMENT_ERROR will be set if the * converter is NULL. * U_BUFFER_OVERFLOW_ERROR will be set if the target is full and there is * still data to be written to the target. 
* @see ucnv_fromUChars * @see ucnv_convert * @see ucnv_getMinCharSize * @see ucnv_setFromUCallBack * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ucnv_fromUnicode (UConverter * converter, char **target, const char *targetLimit, const UChar ** source, const UChar * sourceLimit, int32_t* offsets, UBool flush, UErrorCode * err); /** * Converts a buffer of codepage bytes into an array of unicode UChars * characters. This function is optimized for converting a continuous * stream of data in buffer-sized chunks, where the entire source and * target does not fit in available buffers. * * The source pointer is an in/out parameter. It starts out pointing where the * conversion is to begin, and ends up pointing after the last byte of source consumed. * * Target similarly starts out pointing at the first available UChar in the output * buffer, and ends up pointing after the last UChar written to the output. * It does NOT necessarily keep UChar sequences together. * * The converter always attempts to consume the entire source buffer, unless * (1.) the target buffer is full, or (2.) a failing error is returned from the * current callback function. When a successful error status has been * returned, it means that all of the source buffer has been * consumed. At that point, the caller should reset the source and * sourceLimit pointers to point to the next chunk. * * At the end of the stream (flush==true), the input is completely consumed * when *source==sourceLimit and no error code is set. * The converter object is then automatically reset by this function. * (This means that a converter need not be reset explicitly between data * streams if it finishes the previous stream without errors.) * * This is a stateful conversion. Additionally, even when all source data has * been consumed, some data may be in the converter's internal state. 
* Call this function repeatedly, updating the target pointers with * the next empty chunk of target in case of a * U_BUFFER_OVERFLOW_ERROR, and updating the source pointers * with the next chunk of source when a successful error status is * returned, until there are no more chunks of source data. * @param converter the Unicode converter * @param target I/O parameter. Input : Points to the beginning of the buffer to copy * UChars into. Output : points to after the last UChar copied. * @param targetLimit the pointer just after the end of the target buffer * @param source I/O parameter, pointer to pointer to the source codepage buffer. * @param sourceLimit the pointer to the byte after the end of the source buffer * @param offsets if NULL is passed, nothing will happen to it, otherwise it needs to have the same number * of allocated cells as target. Will fill in offsets from target to source pointer * e.g., if offsets[3] is equal to 6, it means that target[3] was a result of transcoding source[6] * For output data carried across calls, and other data without a specific source character * (such as from escape sequences or callbacks) -1 will be placed for offsets. * @param flush set to true if the current source buffer is the last available * chunk of the source, false otherwise. Note that if a failing status is returned, * this function may have to be called multiple times with flush set to true until * the source buffer is consumed. * @param err the error status. U_ILLEGAL_ARGUMENT_ERROR will be set if the * converter is NULL. * U_BUFFER_OVERFLOW_ERROR will be set if the target is full and there is * still data to be written to the target. 
* @see ucnv_toUChars * @see ucnv_convert * @see ucnv_getMinCharSize * @see ucnv_setToUCallBack * @see ucnv_getNextUChar * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ucnv_toUnicode(UConverter *converter, UChar **target, const UChar *targetLimit, const char **source, const char *sourceLimit, int32_t *offsets, UBool flush, UErrorCode *err); /** * Convert the Unicode string into a codepage string using an existing UConverter. * The output string is NUL-terminated if possible. * * This function is a more convenient but less powerful version of ucnv_fromUnicode(). * It is only useful for whole strings, not for streaming conversion. * * The maximum output buffer capacity required (barring output from callbacks) will be * UCNV_GET_MAX_BYTES_FOR_STRING(srcLength, ucnv_getMaxCharSize(cnv)). * * @param cnv the converter object to be used (ucnv_resetFromUnicode() will be called) * @param src the input Unicode string * @param srcLength the input string length, or -1 if NUL-terminated * @param dest destination string buffer, can be NULL if destCapacity==0 * @param destCapacity the number of chars available at dest * @param pErrorCode normal ICU error code; * common error codes that may be set by this function include * U_BUFFER_OVERFLOW_ERROR, U_STRING_NOT_TERMINATED_WARNING, * U_ILLEGAL_ARGUMENT_ERROR, and conversion errors * @return the length of the output string, not counting the terminating NUL; * if the length is greater than destCapacity, then the string will not fit * and a buffer of the indicated length would need to be passed in * @see ucnv_fromUnicode * @see ucnv_convert * @see UCNV_GET_MAX_BYTES_FOR_STRING * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 ucnv_fromUChars(UConverter *cnv, char *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, UErrorCode *pErrorCode); /** * Convert the codepage string into a Unicode string using an existing UConverter. * The output string is NUL-terminated if possible. 
* * This function is a more convenient but less powerful version of ucnv_toUnicode(). * It is only useful for whole strings, not for streaming conversion. * * The maximum output buffer capacity required (barring output from callbacks) will be * 2*srcLength (each char may be converted into a surrogate pair). * * @param cnv the converter object to be used (ucnv_resetToUnicode() will be called) * @param src the input codepage string * @param srcLength the input string length, or -1 if NUL-terminated * @param dest destination string buffer, can be NULL if destCapacity==0 * @param destCapacity the number of UChars available at dest * @param pErrorCode normal ICU error code; * common error codes that may be set by this function include * U_BUFFER_OVERFLOW_ERROR, U_STRING_NOT_TERMINATED_WARNING, * U_ILLEGAL_ARGUMENT_ERROR, and conversion errors * @return the length of the output string, not counting the terminating NUL; * if the length is greater than destCapacity, then the string will not fit * and a buffer of the indicated length would need to be passed in * @see ucnv_toUnicode * @see ucnv_convert * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 ucnv_toUChars(UConverter *cnv, UChar *dest, int32_t destCapacity, const char *src, int32_t srcLength, UErrorCode *pErrorCode); /** * Convert a codepage buffer into Unicode one character at a time. * The input is completely consumed when the U_INDEX_OUTOFBOUNDS_ERROR is set. * * Advantage compared to ucnv_toUnicode() or ucnv_toUChars(): * - Faster for small amounts of data, for most converters, e.g., * US-ASCII, ISO-8859-1, UTF-8/16/32, and most "normal" charsets. * (For complex converters, e.g., SCSU, UTF-7 and ISO 2022 variants, * it uses ucnv_toUnicode() internally.) * - Convenient. * * Limitations compared to ucnv_toUnicode(): * - Always assumes flush=true. 
* This makes ucnv_getNextUChar() unsuitable for "streaming" conversion, * that is, for where the input is supplied in multiple buffers, * because ucnv_getNextUChar() will assume the end of the input at the end * of the first buffer. * - Does not provide offset output. * * It is possible to "mix" ucnv_getNextUChar() and ucnv_toUnicode() because * ucnv_getNextUChar() uses the current state of the converter * (unlike ucnv_toUChars() which always resets first). * However, if ucnv_getNextUChar() is called after ucnv_toUnicode() * stopped in the middle of a character sequence (with flush=false), * then ucnv_getNextUChar() will always use the slower ucnv_toUnicode() * internally until the next character boundary. * (This is new in ICU 2.6. In earlier releases, ucnv_getNextUChar() had to * start at a character boundary.) * * Instead of using ucnv_getNextUChar(), it is recommended * to convert using ucnv_toUnicode() or ucnv_toUChars() * and then iterate over the text using U16_NEXT() or a UCharIterator (uiter.h) * or a C++ CharacterIterator or similar. * This allows streaming conversion and offset output, for example. * *

Handling of surrogate pairs and supplementary-plane code points:
* There are two different kinds of codepages that provide mappings for surrogate characters: *

    *
  • Codepages like UTF-8, UTF-32, and GB 18030 provide direct representations for Unicode * code points U+10000-U+10ffff as well as for single surrogates U+d800-U+dfff. * Each valid sequence will result in exactly one returned code point. * If a sequence results in a single surrogate, then that will be returned * by itself, even if a neighboring sequence encodes the matching surrogate.
  • *
  • Codepages like SCSU and LMBCS (and UTF-16) provide direct representations only for BMP code points * including surrogates. Code points in supplementary planes are represented with * two sequences, each encoding a surrogate. * For these codepages, matching pairs of surrogates will be combined into single * code points for returning from this function. * (Note that SCSU is actually a mix of these codepage types.)
  • *

* * @param converter an open UConverter * @param source the address of a pointer to the codepage buffer, will be * updated to point after the bytes consumed in the conversion call. * @param sourceLimit points to the end of the input buffer * @param err fills in error status (see ucnv_toUnicode) * U_INDEX_OUTOFBOUNDS_ERROR will be set if the input * is empty or does not convert to any output (e.g.: pure state-change * codes SI/SO, escape sequences for ISO 2022, * or if the callback did not output anything, ...). * This function will not set a U_BUFFER_OVERFLOW_ERROR because * the "buffer" is the return code. However, there might be subsequent output * stored in the converter object * that will be returned in following calls to this function. * @return a UChar32 resulting from the partial conversion of source * @see ucnv_toUnicode * @see ucnv_toUChars * @see ucnv_convert * @stable ICU 2.0 */ U_CAPI UChar32 U_EXPORT2 ucnv_getNextUChar(UConverter * converter, const char **source, const char * sourceLimit, UErrorCode * err); /** * Convert from one external charset to another using two existing UConverters. * Internally, two conversions - ucnv_toUnicode() and ucnv_fromUnicode() - * are used, "pivoting" through 16-bit Unicode. * * Important: For streaming conversion (multiple function calls for successive * parts of a text stream), the caller must provide a pivot buffer explicitly, * and must preserve the pivot buffer and associated pointers from one * call to another. (The buffer may be moved if its contents and the relative * pointer positions are preserved.) 
* * There is a similar function, ucnv_convert(), * which has the following limitations: * - it takes charset names, not converter objects, so that * - two converters are opened for each call * - only single-string conversion is possible, not streaming operation * - it does not provide enough information to find out, * in case of failure, whether the toUnicode or * the fromUnicode conversion failed * * By contrast, ucnv_convertEx() * - takes UConverter parameters instead of charset names * - fully exposes the pivot buffer for streaming conversion and complete error handling * * ucnv_convertEx() also provides further convenience: * - an option to reset the converters at the beginning * (if reset==true, see parameters; * also sets *pivotTarget=*pivotSource=pivotStart) * - allow NUL-terminated input * (only a single NUL byte, will not work for charsets with multi-byte NULs) * (if sourceLimit==NULL, see parameters) * - terminate with a NUL on output * (only a single NUL byte, not useful for charsets with multi-byte NULs), * or set U_STRING_NOT_TERMINATED_WARNING if the output exactly fills * the target buffer * - the pivot buffer can be provided internally; * possible only for whole-string conversion, not streaming conversion; * in this case, the caller will not be able to get details about where an * error occurred * (if pivotStart==NULL, see below) * * The function returns when one of the following is true: * - the entire source text has been converted successfully to the target buffer * - a target buffer overflow occurred (U_BUFFER_OVERFLOW_ERROR) * - a conversion error occurred * (other U_FAILURE(), see description of pErrorCode) * * Limitation compared to the direct use of * ucnv_fromUnicode() and ucnv_toUnicode(): * ucnv_convertEx() does not provide offset information. * * Limitation compared to ucnv_fromUChars() and ucnv_toUChars(): * ucnv_convertEx() does not support preflighting directly. 
* * Sample code for converting a single string from * one external charset to UTF-8, ignoring the location of errors: * * \code * int32_t * myToUTF8(UConverter *cnv, * const char *s, int32_t length, * char *u8, int32_t capacity, * UErrorCode *pErrorCode) { * UConverter *utf8Cnv; * char *target; * * if(U_FAILURE(*pErrorCode)) { * return 0; * } * * utf8Cnv=myGetCachedUTF8Converter(pErrorCode); * if(U_FAILURE(*pErrorCode)) { * return 0; * } * * if(length<0) { * length=strlen(s); * } * target=u8; * ucnv_convertEx(utf8Cnv, cnv, * &target, u8+capacity, * &s, s+length, * NULL, NULL, NULL, NULL, * true, true, * pErrorCode); * * myReleaseCachedUTF8Converter(utf8Cnv); * * // return the output string length, but without preflighting * return (int32_t)(target-u8); * } * \endcode * * @param targetCnv Output converter, used to convert from the UTF-16 pivot * to the target using ucnv_fromUnicode(). * @param sourceCnv Input converter, used to convert from the source to * the UTF-16 pivot using ucnv_toUnicode(). * @param target I/O parameter, same as for ucnv_fromUChars(). * Input: *target points to the beginning of the target buffer. * Output: *target points to the first unit after the last char written. * @param targetLimit Pointer to the first unit after the target buffer. * @param source I/O parameter, same as for ucnv_toUChars(). * Input: *source points to the beginning of the source buffer. * Output: *source points to the first unit after the last char read. * @param sourceLimit Pointer to the first unit after the source buffer. * @param pivotStart Pointer to the UTF-16 pivot buffer. If pivotStart==NULL, * then an internal buffer is used and the other pivot * arguments are ignored and can be NULL as well. * @param pivotSource I/O parameter, same as source in ucnv_fromUChars() for * conversion from the pivot buffer to the target buffer. * @param pivotTarget I/O parameter, same as target in ucnv_toUChars() for * conversion from the source buffer to the pivot buffer. 
* It must be pivotStart<=*pivotSource<=*pivotTarget<=pivotLimit * and pivotStart[0..ucnv_countAvaiable()]) * @return a pointer a string (library owned), or NULL if the index is out of bounds. * @see ucnv_countAvailable * @stable ICU 2.0 */ U_CAPI const char* U_EXPORT2 ucnv_getAvailableName(int32_t n); /** * Returns a UEnumeration to enumerate all of the canonical converter * names, as per the alias file, regardless of the ability to open each * converter. * * @return A UEnumeration object for getting all the recognized canonical * converter names. * @see ucnv_getAvailableName * @see uenum_close * @see uenum_next * @stable ICU 2.4 */ U_CAPI UEnumeration * U_EXPORT2 ucnv_openAllNames(UErrorCode *pErrorCode); /** * Gives the number of aliases for a given converter or alias name. * If the alias is ambiguous, then the preferred converter is used * and the status is set to U_AMBIGUOUS_ALIAS_WARNING. * This method only enumerates the listed entries in the alias file. * @param alias alias name * @param pErrorCode error status * @return number of names on alias list for given alias * @stable ICU 2.0 */ U_CAPI uint16_t U_EXPORT2 ucnv_countAliases(const char *alias, UErrorCode *pErrorCode); /** * Gives the name of the alias at given index of alias list. * This method only enumerates the listed entries in the alias file. * If the alias is ambiguous, then the preferred converter is used * and the status is set to U_AMBIGUOUS_ALIAS_WARNING. * @param alias alias name * @param n index in alias list * @param pErrorCode result of operation * @return returns the name of the alias at given index * @see ucnv_countAliases * @stable ICU 2.0 */ U_CAPI const char * U_EXPORT2 ucnv_getAlias(const char *alias, uint16_t n, UErrorCode *pErrorCode); /** * Fill-up the list of alias names for the given alias. * This method only enumerates the listed entries in the alias file. * If the alias is ambiguous, then the preferred converter is used * and the status is set to U_AMBIGUOUS_ALIAS_WARNING. 
* @param alias alias name * @param aliases fill-in list, aliases is a pointer to an array of * ucnv_countAliases() string-pointers * (const char *) that will be filled in. * The strings themselves are owned by the library. * @param pErrorCode result of operation * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ucnv_getAliases(const char *alias, const char **aliases, UErrorCode *pErrorCode); /** * Return a new UEnumeration object for enumerating all the * alias names for a given converter that are recognized by a standard. * This method only enumerates the listed entries in the alias file. * The convrtrs.txt file can be modified to change the results of * this function. * The first result in this list is the same result given by * ucnv_getStandardName, which is the default alias for * the specified standard name. The returned object must be closed with * uenum_close when you are done with the object. * * @param convName original converter name * @param standard name of the standard governing the names; MIME and IANA * are such standards * @param pErrorCode The error code * @return A UEnumeration object for getting all aliases that are recognized * by a standard. If any of the parameters are invalid, NULL * is returned. * @see ucnv_getStandardName * @see uenum_close * @see uenum_next * @stable ICU 2.2 */ U_CAPI UEnumeration * U_EXPORT2 ucnv_openStandardNames(const char *convName, const char *standard, UErrorCode *pErrorCode); /** * Gives the number of standards associated to converter names. * @return number of standards * @stable ICU 2.0 */ U_CAPI uint16_t U_EXPORT2 ucnv_countStandards(void); /** * Gives the name of the standard at given index of standard list. * @param n index in standard list * @param pErrorCode result of operation * @return returns the name of the standard at given index. Owned by the library. 
* @stable ICU 2.0 */ U_CAPI const char * U_EXPORT2 ucnv_getStandard(uint16_t n, UErrorCode *pErrorCode); /** * Returns a standard name for a given converter name. *

* Example alias table:
* conv alias1 { STANDARD1 } alias2 { STANDARD1* } *

* Result of ucnv_getStandardName("conv", "STANDARD1") from example * alias table:
* "alias2" * * @param name original converter name * @param standard name of the standard governing the names; MIME and IANA * are such standards * @param pErrorCode result of operation * @return returns the standard converter name; * if a standard converter name cannot be determined, * then NULL is returned. Owned by the library. * @stable ICU 2.0 */ U_CAPI const char * U_EXPORT2 ucnv_getStandardName(const char *name, const char *standard, UErrorCode *pErrorCode); /** * This function will return the internal canonical converter name of the * tagged alias. This is the opposite of ucnv_openStandardNames, which * returns the tagged alias given the canonical name. *

* Example alias table:
* conv alias1 { STANDARD1 } alias2 { STANDARD1* } *

* Result of ucnv_getCanonicalName("alias1", "STANDARD1") from example * alias table:
* "conv" * * @return returns the canonical converter name; * if a standard or alias name cannot be determined, * then NULL is returned. The returned string is * owned by the library. * @see ucnv_getStandardName * @stable ICU 2.4 */ U_CAPI const char * U_EXPORT2 ucnv_getCanonicalName(const char *alias, const char *standard, UErrorCode *pErrorCode); /** * Returns the current default converter name. If you want to open * a default converter, you do not need to use this function. * It is faster to pass a NULL argument to ucnv_open() to open the * default converter. * * If U_CHARSET_IS_UTF8 is defined to 1 in utypes.h then this function * always returns "UTF-8". * * @return returns the current default converter name. * Storage owned by the library * @see ucnv_setDefaultName * @stable ICU 2.0 */ U_CAPI const char * U_EXPORT2 ucnv_getDefaultName(void); #ifndef U_HIDE_SYSTEM_API /** * This function is not thread safe. DO NOT call this function when ANY ICU * function is being used from more than one thread! This function sets the * current default converter name. If this function needs to be called, it * should be called during application initialization. Most of the time, the * results from ucnv_getDefaultName() or ucnv_open() with a NULL string argument * are sufficient for your application. * * If U_CHARSET_IS_UTF8 is defined to 1 in utypes.h then this function * does nothing. * * @param name the converter name to be the default (must be known by ICU). * @see ucnv_getDefaultName * @system * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ucnv_setDefaultName(const char *name); #endif /* U_HIDE_SYSTEM_API */ /** * Fixes the backslash character mismapping. For example, in SJIS, the backslash * character in the ASCII portion is also used to represent the yen currency sign. * When mapping from Unicode character 0x005C, it's unclear whether to map the * character back to yen or backslash in SJIS. 
This function will take the input * buffer and replace all the yen sign characters with backslash. This is necessary * when the user tries to open a file with the input buffer on Windows. * This function will test the converter to see whether such mapping is * required. You can sometimes avoid using this function by using the correct version * of Shift-JIS. * * @param cnv The converter representing the target codepage. * @param source the input buffer to be fixed * @param sourceLen the length of the input buffer * @see ucnv_isAmbiguous * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ucnv_fixFileSeparator(const UConverter *cnv, UChar *source, int32_t sourceLen); /** * Determines if the converter contains ambiguous mappings of the same * character or not. * @param cnv the converter to be tested * @return true if the converter contains ambiguous mapping of the same * character, false otherwise. * @stable ICU 2.0 */ U_CAPI UBool U_EXPORT2 ucnv_isAmbiguous(const UConverter *cnv); /** * Sets the converter to use fallback mappings or not. * Regardless of this flag, the converter will always use * fallbacks from Unicode Private Use code points, as well as * reverse fallbacks (to Unicode). * For details see ".ucm File Format" * in the Conversion Data chapter of the ICU User Guide: * https://unicode-org.github.io/icu/userguide/conversion/data.html#ucm-file-format * * @param cnv The converter to set the fallback mapping usage on. * @param usesFallback true if the user wants the converter to take advantage of the fallback * mapping, false otherwise. * @stable ICU 2.0 * @see ucnv_usesFallback */ U_CAPI void U_EXPORT2 ucnv_setFallback(UConverter *cnv, UBool usesFallback); /** * Determines if the converter uses fallback mappings or not. * This flag has restrictions, see ucnv_setFallback(). * * @param cnv The converter to be tested * @return true if the converter uses fallback, false otherwise. 
* @stable ICU 2.0 * @see ucnv_setFallback */ U_CAPI UBool U_EXPORT2 ucnv_usesFallback(const UConverter *cnv); /** * Detects Unicode signature byte sequences at the start of the byte stream * and returns the charset name of the indicated Unicode charset. * NULL is returned when no Unicode signature is recognized. * The number of bytes in the signature is output as well. * * The caller can ucnv_open() a converter using the charset name. * The first code unit (UChar) from the start of the stream will be U+FEFF * (the Unicode BOM/signature character) and can usually be ignored. * * For most Unicode charsets it is also possible to ignore the indicated * number of initial stream bytes and start converting after them. * However, there are stateful Unicode charsets (UTF-7 and BOCU-1) for which * this will not work. Therefore, it is best to ignore the first output UChar * instead of the input signature bytes. *

* Usage: * \snippet samples/ucnv/convsamp.cpp ucnv_detectUnicodeSignature * * @param source The source string in which the signature should be detected. * @param sourceLength Length of the input string, or -1 if terminated with a NUL byte. * @param signatureLength A pointer to int32_t to receive the number of bytes that make up the signature * of the detected UTF. 0 if not detected. * Can be a NULL pointer. * @param pErrorCode ICU error code in/out parameter. * Must fulfill U_SUCCESS before the function call. * @return The name of the encoding detected. NULL if encoding is not detected. * @stable ICU 2.4 */ U_CAPI const char* U_EXPORT2 ucnv_detectUnicodeSignature(const char* source, int32_t sourceLength, int32_t *signatureLength, UErrorCode *pErrorCode); /** * Returns the number of UChars held in the converter's internal state * because more input is needed for completing the conversion. This function is * useful for mapping semantics of ICU's converter interface to those of iconv, * and this information is not needed for normal conversion. * @param cnv The converter in which the input is held * @param status ICU error code in/out parameter. * Must fulfill U_SUCCESS before the function call. * @return The number of UChars in the state. -1 if an error is encountered. * @stable ICU 3.4 */ U_CAPI int32_t U_EXPORT2 ucnv_fromUCountPending(const UConverter* cnv, UErrorCode* status); /** * Returns the number of chars held in the converter's internal state * because more input is needed for completing the conversion. This function is * useful for mapping semantics of ICU's converter interface to those of iconv, * and this information is not needed for normal conversion. * @param cnv The converter in which the input is held as internal state * @param status ICU error code in/out parameter. * Must fulfill U_SUCCESS before the function call. * @return The number of chars in the state. -1 if an error is encountered. 
* @stable ICU 3.4 */ U_CAPI int32_t U_EXPORT2 ucnv_toUCountPending(const UConverter* cnv, UErrorCode* status); /** * Returns whether or not the charset of the converter has a fixed number of bytes * per charset character. * Examples of this are converters that are of the type UCNV_SBCS or UCNV_DBCS. * Another example is UTF-32 which is always 4 bytes per character. * A Unicode code point may be represented by more than one UTF-8 or UTF-16 code unit * but a UTF-32 converter encodes each code point with 4 bytes. * Note: This method is not intended to be used to determine whether the charset has a * fixed ratio of bytes to Unicode code units for any particular Unicode encoding form. * false is returned with the UErrorCode if an error occurs or cnv is NULL. * @param cnv The converter to be tested * @param status ICU error code in/out parameter * @return true if the converter is fixed-width * @stable ICU 4.8 */ U_CAPI UBool U_EXPORT2 ucnv_isFixedWidth(UConverter *cnv, UErrorCode *status); #endif #endif /*_UCNV*/ // ucnv_cb.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (C) 2000-2004, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * ucnv_cb.h: * External APIs for the ICU's codeset conversion library * Helena Shih * * Modification History: * * Date Name Description */ /** * \file * \brief C API: UConverter functions to aid the writers of callbacks * *

Callback API for UConverter

* * These functions are provided here for the convenience of the callback * writer. If you are just looking for callback functions to use, please * see ucnv_err.h. DO NOT call these functions directly when you are * working with converters, unless your code has been called as a callback * via ucnv_setFromUCallback or ucnv_setToUCallback !! * * A note about error codes and overflow. Unlike other ICU functions, * these functions do not expect the error status to be U_ZERO_ERROR. * Callbacks must be much more careful about their error codes. * The error codes used here are in/out parameters, which should be passed * back in the callback's error parameter. * * For example, if you call ucnv_cbFromUWriteBytes to write data out * to the output codepage, it may return U_BUFFER_OVERFLOW_ERROR if * the data did not fit in the target. But this isn't a failing error; * in fact, ucnv_cbFromUWriteBytes may be called AGAIN with the error * status still U_BUFFER_OVERFLOW_ERROR to attempt to write further bytes, * which will also go into the internal overflow buffers. * * Concerning offsets, the 'offset' parameters here are relative to the start * of SOURCE. For example, suppose the string "ABCD" was being converted * from Unicode into a codepage which doesn't have a mapping for 'B'. * 'A' will be written out correctly, but * the FromU callback will be called on an unassigned character for 'B'. * At this point, this is the state of the world: * Target: A [..] [points after A] * Source: A B [C] D [points to C - B has been consumed] * 0 1 2 3 * codePoint = "B" [the unassigned codepoint] * * Now, suppose a callback wants to write the substitution character '?' to * the target. It calls ucnv_cbFromUWriteBytes() to write the ?. * It should pass ZERO as the offset, because the offset as far as the * callback is concerned is relative to the SOURCE pointer [which points * before 'C'.] 
If the callback goes into the args and consumes 'C' also, * it would call FromUWriteBytes with an offset of 1 (and advance the source * pointer). * */ #ifndef UCNV_CB_H #define UCNV_CB_H #if !UCONFIG_NO_CONVERSION /** * ONLY used by FromU callback functions. * Writes out the specified output bytes to the target byte buffer or to converter internal buffers. * * @param args callback fromUnicode arguments * @param source source bytes to write * @param length length of bytes to write * @param offsetIndex the relative offset index from callback. * @param err error status. If U_BUFFER_OVERFLOW_ERROR is returned, then U_BUFFER_OVERFLOW_ERROR must * be returned to the user, because it means that not all data could be written into the target buffer, and some is * in the converter error buffer. * @see ucnv_cbFromUWriteSub * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ucnv_cbFromUWriteBytes (UConverterFromUnicodeArgs *args, const char* source, int32_t length, int32_t offsetIndex, UErrorCode * err); /** * ONLY used by FromU callback functions. * This function will write out the correct substitution character sequence * to the target. * * @param args callback fromUnicode arguments * @param offsetIndex the relative offset index from the current source pointer to be used * @param err error status. If U_BUFFER_OVERFLOW_ERROR is returned, then U_BUFFER_OVERFLOW_ERROR must * be returned to the user, because it means that not all data could be written into the target buffer, and some is * in the converter error buffer. * @see ucnv_cbFromUWriteBytes * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ucnv_cbFromUWriteSub (UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode * err); /** * ONLY used by FromU callback functions. * This function will write out the error character(s) to the target UChar buffer. 
* * @param args callback fromUnicode arguments * @param source pointer to pointer to first UChar to write [on exit: 1 after last UChar processed] * @param sourceLimit pointer after last UChar to write * @param offsetIndex the relative offset index from callback which will be set * @param err error status U_BUFFER_OVERFLOW_ERROR * @see ucnv_cbFromUWriteSub * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ucnv_cbFromUWriteUChars(UConverterFromUnicodeArgs *args, const UChar** source, const UChar* sourceLimit, int32_t offsetIndex, UErrorCode * err); /** * ONLY used by ToU callback functions. * This function will write out the specified characters to the target * UChar buffer. * * @param args callback toUnicode arguments * @param source source string to write * @param length the length of source string * @param offsetIndex the relative offset index which will be written. * @param err error status U_BUFFER_OVERFLOW_ERROR * @see ucnv_cbToUWriteSub * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ucnv_cbToUWriteUChars (UConverterToUnicodeArgs *args, const UChar* source, int32_t length, int32_t offsetIndex, UErrorCode * err); /** * ONLY used by ToU callback functions. * This function will write out the Unicode substitution character (U+FFFD). * * @param args callback toUnicode arguments * @param offsetIndex the relative offset index from callback. * @param err error status U_BUFFER_OVERFLOW_ERROR * @see ucnv_cbToUWriteUChars * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ucnv_cbToUWriteSub (UConverterToUnicodeArgs *args, int32_t offsetIndex, UErrorCode * err); #endif #endif // uclean.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ****************************************************************************** * Copyright (C) 2001-2014, International Business Machines * Corporation and others. All Rights Reserved. 
****************************************************************************** * file name: uclean.h * encoding: UTF-8 * tab size: 8 (not used) * indentation:4 * * created on: 2001July05 * created by: George Rhoten */ #ifndef __UCLEAN_H__ #define __UCLEAN_H__ /** * \file * \brief C API: Initialize and clean up ICU */ /** * Initialize ICU. * * Use of this function is optional. It is OK to simply use ICU * services and functions without first having initialized * ICU by calling u_init(). * * u_init() will attempt to load some part of ICU's data, and is * useful as a test for configuration or installation problems that * leave the ICU data inaccessible. A successful invocation of u_init() * does not, however, guarantee that all ICU data is accessible. * * Multiple calls to u_init() cause no harm, aside from the small amount * of time required. * * In old versions of ICU, u_init() was required in multi-threaded applications * to ensure the thread safety of ICU. u_init() is no longer needed for this purpose. * * @param status An ICU UErrorCode parameter. It must not be NULL. * An Error will be returned if some required part of ICU data can not * be loaded or initialized. * The function returns immediately if the input error code indicates a * failure, as usual. * * @stable ICU 2.6 */ U_CAPI void U_EXPORT2 u_init(UErrorCode *status); #ifndef U_HIDE_SYSTEM_API /** * Clean up the system resources, such as allocated memory or open files, * used in all ICU libraries. This will free/delete all memory owned by the * ICU libraries, and return them to their original load state. All open ICU * items (collators, resource bundles, converters, etc.) must be closed before * calling this function, otherwise ICU may not free its allocated memory * (e.g. close your converters and resource bundles before calling this * function). Generally, this function should be called once just before * an application exits. 
For applications that dynamically load and unload * the ICU libraries (relatively uncommon), u_cleanup() should be called * just before the library unload. *

* u_cleanup() also clears any ICU heap functions, mutex functions or * trace functions that may have been set for the process. * This has the effect of restoring ICU to its initial condition, before * any of these override functions were installed. Refer to * u_setMemoryFunctions(), u_setMutexFunctions and * utrace_setFunctions(). If ICU is to be reinitialized after * calling u_cleanup(), these runtime override functions will need to * be set up again if they are still required. *

* u_cleanup() is not thread safe. All other threads should stop using ICU * before calling this function. *

* Any open ICU items will be left in an undefined state by u_cleanup(), * and any subsequent attempt to use such an item will give unpredictable * results. *

* After calling u_cleanup(), an application may continue to use ICU by * calling u_init(). An application must invoke u_init() first from one single * thread before allowing other threads to call u_init(). All threads existing * at the time of the first thread's call to u_init() must also call * u_init() themselves before continuing with other ICU operations. *

* The use of u_cleanup() just before an application terminates is optional, * but it should be called only once for performance reasons. The primary * benefit is to eliminate reports of memory or resource leaks originating * in ICU code from the results generated by heap analysis tools. *

* Use this function with great care! *

* * @stable ICU 2.0 * @system */ U_CAPI void U_EXPORT2 u_cleanup(void); U_CDECL_BEGIN /** * Pointer type for a user supplied memory allocation function. * @param context user supplied value, obtained from u_setMemoryFunctions(). * @param size The number of bytes to be allocated * @return Pointer to the newly allocated memory, or NULL if the allocation failed. * @stable ICU 2.8 * @system */ typedef void *U_CALLCONV UMemAllocFn(const void *context, size_t size); /** * Pointer type for a user supplied memory re-allocation function. * @param context user supplied value, obtained from u_setMemoryFunctions(). * @param mem Pointer to the memory block to be resized. * @param size The new size for the block. * @return Pointer to the resized memory block, or NULL if the resizing failed. * @stable ICU 2.8 * @system */ typedef void *U_CALLCONV UMemReallocFn(const void *context, void *mem, size_t size); /** * Pointer type for a user supplied memory free function. Behavior should be * similar to the standard C library free(). This function returns no value. * @param context user supplied value, obtained from u_setMemoryFunctions(). * @param mem Pointer to the memory block to be freed. * @stable ICU 2.8 * @system */ typedef void U_CALLCONV UMemFreeFn (const void *context, void *mem); /** * Set the functions that ICU will use for memory allocation. * Use of this function is optional; by default (without this function), ICU will * use the standard C library malloc() and free() functions. * This function can only be used when ICU is in an initial, unused state, before * u_init() has been called. * @param context This pointer value will be saved, and then (later) passed as * a parameter to the memory functions each time they * are called. * @param a Pointer to a user-supplied malloc function. * @param r Pointer to a user-supplied realloc function. * @param f Pointer to a user-supplied free function. 
* @param status Receives error values. * @stable ICU 2.8 * @system */ U_CAPI void U_EXPORT2 u_setMemoryFunctions(const void *context, UMemAllocFn * U_CALLCONV_FPTR a, UMemReallocFn * U_CALLCONV_FPTR r, UMemFreeFn * U_CALLCONV_FPTR f, UErrorCode *status); U_CDECL_END #endif /* U_HIDE_SYSTEM_API */ #endif // ucat.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (c) 2003-2004, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * Author: Alan Liu * Created: March 19 2003 * Since: ICU 2.6 ********************************************************************** */ #ifndef UCAT_H #define UCAT_H /** * \file * \brief C API: Message Catalog Wrappers * * This C API provides look-alike functions that deliberately resemble * the POSIX catopen, catclose, and catgets functions. The underlying * implementation is in terms of ICU resource bundles, rather than * POSIX message catalogs. * * The ICU resource bundles obey standard ICU inheritance policies. * To facilitate this, sets and messages are flattened into one tier. * This is done by creating resource bundle keys of the form * &lt;set_num&gt;%&lt;msg_num&gt; where set_num is the set number and msg_num is * the message number, formatted as decimal strings. * * Example: Consider a message catalog containing two sets: * * Set 1: Message 4 = "Good morning." * Message 5 = "Good afternoon." * Message 7 = "Good evening." * Message 8 = "Good night." * Set 4: Message 14 = "Please " * Message 19 = "Thank you." * Message 20 = "Sincerely," * * The ICU resource bundle source file, assuming it is named * "greet.txt", would look like this: * * greet * { * 1%4 { "Good morning." } * 1%5 { "Good afternoon." } * 1%7 { "Good evening." } * 1%8 { "Good night." 
} * * 4%14 { "Please " } * 4%19 { "Thank you." } * 4%20 { "Sincerely," } * } * * The catgets function is commonly used in combination with functions * like printf and strftime. ICU components like message format can * be used instead, although they use a different format syntax. * There is an ICU package, icuio, that provides some of * the POSIX-style formatting API. */ U_CDECL_BEGIN /** * An ICU message catalog descriptor, analogous to nl_catd. * * @stable ICU 2.6 */ typedef UResourceBundle* u_nl_catd; /** * Open and return an ICU message catalog descriptor. The descriptor * may be passed to u_catgets() to retrieve localized strings. * * @param name string containing the full path pointing to the * directory where the resources reside followed by the package name * e.g. "/usr/resource/my_app/resources/guimessages" on a Unix system. * If NULL, ICU default data files will be used. * * Unlike POSIX, environment variables are not interpolated within the * name. * * @param locale the locale for which we want to open the resource. If * NULL, the default ICU locale will be used (see uloc_getDefault). If * strlen(locale) == 0, the root locale will be used. * * @param ec input/output error code. Upon output, * U_USING_FALLBACK_WARNING indicates that a fallback locale was * used. For example, 'de_CH' was requested, but nothing was found * there, so 'de' was used. U_USING_DEFAULT_WARNING indicates that the * default locale data or root locale data was used; neither the * requested locale nor any of its fallback locales were found. * * @return a message catalog descriptor that may be passed to * u_catgets(). If the ec parameter indicates success, then the caller * is responsible for calling u_catclose() to close the message * catalog. If the ec parameter indicates failure, then NULL will be * returned. * * @stable ICU 2.6 */ U_CAPI u_nl_catd U_EXPORT2 u_catopen(const char* name, const char* locale, UErrorCode* ec); /** * Close an ICU message catalog, given its descriptor. 
* * @param catd a message catalog descriptor to be closed. May be NULL, * in which case no action is taken. * * @stable ICU 2.6 */ U_CAPI void U_EXPORT2 u_catclose(u_nl_catd catd); /** * Retrieve a localized string from an ICU message catalog. * * @param catd a message catalog descriptor returned by u_catopen. * * @param set_num the message catalog set number. Sets need not be * numbered consecutively. * * @param msg_num the message catalog message number within the * set. Messages need not be numbered consecutively. * * @param s the default string. This is returned if the string * specified by the set_num and msg_num is not found. It must be * zero-terminated. * * @param len fill-in parameter to receive the length of the result. * May be NULL, in which case it is ignored. * * @param ec input/output error code. May be U_USING_FALLBACK_WARNING * or U_USING_DEFAULT_WARNING. U_MISSING_RESOURCE_ERROR indicates that * the set_num/msg_num tuple does not specify a valid message string * in this catalog. * * @return a pointer to a zero-terminated UChar array which lives in * an internal buffer area, typically a memory mapped/DLL file. The * caller must NOT delete this pointer. If the call is unsuccessful * for any reason, then s is returned. This includes the situation in * which ec indicates a failing error code upon entry to this * function. * * @stable ICU 2.6 */ U_CAPI const UChar* U_EXPORT2 u_catgets(u_nl_catd catd, int32_t set_num, int32_t msg_num, const UChar* s, int32_t* len, UErrorCode* ec); U_CDECL_END #endif /*UCAT_H*/ /*eof*/ // stringoptions.h // Copyright (C) 2017 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html // stringoptions.h // created: 2017jun08 Markus W. Scherer #ifndef __STRINGOPTIONS_H__ #define __STRINGOPTIONS_H__ /** * \file * \brief C API: Bit set option bit constants for various string and character processing functions. 
*/ /** * Option value for case folding: Use default mappings defined in CaseFolding.txt. * * @stable ICU 2.0 */ #define U_FOLD_CASE_DEFAULT 0 /** * Option value for case folding: * * Use the modified set of mappings provided in CaseFolding.txt to handle dotted I * and dotless i appropriately for Turkic languages (tr, az). * * Before Unicode 3.2, CaseFolding.txt contains mappings marked with 'I' that * are to be included for default mappings and * excluded for the Turkic-specific mappings. * * Unicode 3.2 CaseFolding.txt instead contains mappings marked with 'T' that * are to be excluded for default mappings and * included for the Turkic-specific mappings. * * @stable ICU 2.0 */ #define U_FOLD_CASE_EXCLUDE_SPECIAL_I 1 #if (NTDDI_VERSION >= NTDDI_WIN10_19H1) /** * Titlecase the string as a whole rather than each word. * (Titlecase only the character at index 0, possibly adjusted.) * Option bits value for titlecasing APIs that take an options bit set. * * It is an error to specify multiple titlecasing iterator options together, * including both an options bit and an explicit BreakIterator. * * @see U_TITLECASE_ADJUST_TO_CASED * @stable ICU 60 */ #define U_TITLECASE_WHOLE_STRING 0x20 /** * Titlecase sentences rather than words. * (Titlecase only the first character of each sentence, possibly adjusted.) * Option bits value for titlecasing APIs that take an options bit set. * * It is an error to specify multiple titlecasing iterator options together, * including both an options bit and an explicit BreakIterator. * * @see U_TITLECASE_ADJUST_TO_CASED * @stable ICU 60 */ #define U_TITLECASE_SENTENCES 0x40 #endif // (NTDDI_VERSION >= NTDDI_WIN10_19H1) /** * Do not lowercase non-initial parts of words when titlecasing. * Option bit for titlecasing APIs that take an options bit set. * * By default, titlecasing will titlecase the character at each * (possibly adjusted) BreakIterator index and * lowercase all other characters up to the next iterator index. 
* With this option, the other characters will not be modified. * * @see U_TITLECASE_ADJUST_TO_CASED * @see UnicodeString::toTitle * @see CaseMap::toTitle * @see ucasemap_setOptions * @see ucasemap_toTitle * @see ucasemap_utf8ToTitle * @stable ICU 3.8 */ #define U_TITLECASE_NO_LOWERCASE 0x100 /** * Do not adjust the titlecasing BreakIterator indexes; * titlecase exactly the characters at breaks from the iterator. * Option bit for titlecasing APIs that take an options bit set. * * By default, titlecasing will take each break iterator index, * adjust it to the next relevant character (see U_TITLECASE_ADJUST_TO_CASED), * and titlecase that one. * * Other characters are lowercased. * * It is an error to specify multiple titlecasing adjustment options together. * * @see U_TITLECASE_ADJUST_TO_CASED * @see U_TITLECASE_NO_LOWERCASE * @see UnicodeString::toTitle * @see CaseMap::toTitle * @see ucasemap_setOptions * @see ucasemap_toTitle * @see ucasemap_utf8ToTitle * @stable ICU 3.8 */ #define U_TITLECASE_NO_BREAK_ADJUSTMENT 0x200 #if (NTDDI_VERSION >= NTDDI_WIN10_19H1) /** * Adjust each titlecasing BreakIterator index to the next cased character. * (See the Unicode Standard, chapter 3, Default Case Conversion, R3 toTitlecase(X).) * Option bit for titlecasing APIs that take an options bit set. * * This used to be the default index adjustment in ICU. * Since ICU 60, the default index adjustment is to the next character that is * a letter, number, symbol, or private use code point. * (Uncased modifier letters are skipped.) * The difference in behavior is small for word titlecasing, * but the new adjustment is much better for whole-string and sentence titlecasing: * It yields "49ers" and "«丰(abc)»" instead of "49Ers" and "«丰(Abc)»". * * It is an error to specify multiple titlecasing adjustment options together. 
* * @see U_TITLECASE_NO_BREAK_ADJUSTMENT * @stable ICU 60 */ #define U_TITLECASE_ADJUST_TO_CASED 0x400 /** * Option for string transformation functions to not first reset the Edits object. * Used for example in some case-mapping and normalization functions. * * @see CaseMap * @see Edits * @see Normalizer2 * @stable ICU 60 */ #define U_EDITS_NO_RESET 0x2000 /** * Omit unchanged text when recording how source substrings * relate to changed and unchanged result substrings. * Used for example in some case-mapping and normalization functions. * * @see CaseMap * @see Edits * @see Normalizer2 * @stable ICU 60 */ #define U_OMIT_UNCHANGED_TEXT 0x4000 #endif // (NTDDI_VERSION >= NTDDI_WIN10_19H1) /** * Option bit for u_strCaseCompare, u_strcasecmp, unorm_compare, etc: * Compare strings in code point order instead of code unit order. * @stable ICU 2.2 */ #define U_COMPARE_CODE_POINT_ORDER 0x8000 /** * Option bit for unorm_compare: * Perform case-insensitive comparison. * @stable ICU 2.2 */ #define U_COMPARE_IGNORE_CASE 0x10000 /** * Option bit for unorm_compare: * Both input strings are assumed to fulfill FCD conditions. * @stable ICU 2.2 */ #define UNORM_INPUT_IS_FCD 0x20000 // Related definitions elsewhere. // Options that are not meaningful in the same functions // can share the same bits. // // Public: // unicode/unorm.h #define UNORM_COMPARE_NORM_OPTIONS_SHIFT 20 // // Internal: (may change or be removed) // ucase.h #define _STRCASECMP_OPTIONS_MASK 0xffff // ucase.h #define _FOLD_CASE_OPTIONS_MASK 7 // ucasemap_imp.h #define U_TITLECASE_ITERATOR_MASK 0xe0 // ucasemap_imp.h #define U_TITLECASE_ADJUSTMENT_MASK 0x600 // ustr_imp.h #define _STRNCMP_STYLE 0x1000 // unormcmp.cpp #define _COMPARE_EQUIV 0x80000 #endif // __STRINGOPTIONS_H__ // uchar.h // Copyright (C) 2016 and later: Unicode, Inc. and others. 
// License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (C) 1997-2016, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * * File UCHAR.H * * Modification History: * * Date Name Description * 04/02/97 aliu Creation. * 03/29/99 helena Updated for C APIs. * 4/15/99 Madhu Updated for C Implementation and Javadoc * 5/20/99 Madhu Added the function u_getVersion() * 8/19/1999 srl Upgraded scripts to Unicode 3.0 * 8/27/1999 schererm UCharDirection constants: U_... * 11/11/1999 weiv added u_isalnum(), cleaned comments * 01/11/2000 helena Renamed u_getVersion to u_getUnicodeVersion(). ****************************************************************************** */ #ifndef UCHAR_H #define UCHAR_H #if !defined(USET_DEFINED) && !defined(U_IN_DOXYGEN) #define USET_DEFINED /** * USet is the C API type corresponding to C++ class UnicodeSet. * It is forward-declared here to avoid including unicode/uset.h file if related * APIs are not used. * * @see ucnv_getUnicodeSet * @stable ICU 2.4 */ typedef struct USet USet; #endif U_CDECL_BEGIN /** * \file * \brief C API: Unicode Properties * * This C API provides low-level access to the Unicode Character Database. * In addition to raw property values, some convenience functions calculate * derived properties, for example for Java-style programming. * * Unicode assigns each code point (not just assigned character) values for * many properties. * Most of them are simple boolean flags, or constants from a small enumerated list. * For some properties, values are strings or other relatively more complex types. * * For more information see * "About the Unicode Character Database" (http://www.unicode.org/ucd/) * and the ICU User Guide chapter on Properties (https://unicode-org.github.io/icu/userguide/strings/properties). 
* * Many properties are accessible via generic functions that take a UProperty selector. * - u_hasBinaryProperty() returns a binary value (true/false) per property and code point. * - u_getIntPropertyValue() returns an integer value per property and code point. * For each supported enumerated or catalog property, there is * an enum type for all of the property's values, and * u_getIntPropertyValue() returns the numeric values of those constants. * - u_getBinaryPropertySet() returns a set for each ICU-supported binary property with * all code points for which the property is true. * - u_getIntPropertyMap() returns a map for each * ICU-supported enumerated/catalog/int-valued property which * maps all Unicode code points to their values for that property. * * Many functions are designed to match java.lang.Character functions. * See the individual function documentation, * and see the JDK 1.4 java.lang.Character documentation * at http://java.sun.com/j2se/1.4/docs/api/java/lang/Character.html * * There are also functions that provide easy migration from C/POSIX functions * like isblank(). Their use is generally discouraged because the C/POSIX * standards do not define their semantics beyond the ASCII range, which means * that different implementations exhibit very different behavior. * Instead, Unicode properties should be used directly. * * There are also only a few, broad C/POSIX character classes, and they tend * to be used for conflicting purposes. For example, the "isalpha()" class * is sometimes used to determine word boundaries, while a more sophisticated * approach would at least distinguish initial letters from continuation * characters (the latter including combining marks). * (In ICU, BreakIterator is the most sophisticated API for word boundaries.) * Another example: There is no "istitle()" class for titlecase characters. * * ICU 3.4 and later provides API access for all twelve C/POSIX character classes. 
* ICU implements them according to the Standard Recommendations in * Annex C: Compatibility Properties of UTS #18 Unicode Regular Expressions * (http://www.unicode.org/reports/tr18/#Compatibility_Properties). * * API access for C/POSIX character classes is as follows: * - alpha: u_isUAlphabetic(c) or u_hasBinaryProperty(c, UCHAR_ALPHABETIC) * - lower: u_isULowercase(c) or u_hasBinaryProperty(c, UCHAR_LOWERCASE) * - upper: u_isUUppercase(c) or u_hasBinaryProperty(c, UCHAR_UPPERCASE) * - punct: u_ispunct(c) * - digit: u_isdigit(c) or u_charType(c)==U_DECIMAL_DIGIT_NUMBER * - xdigit: u_isxdigit(c) or u_hasBinaryProperty(c, UCHAR_POSIX_XDIGIT) * - alnum: u_hasBinaryProperty(c, UCHAR_POSIX_ALNUM) * - space: u_isUWhiteSpace(c) or u_hasBinaryProperty(c, UCHAR_WHITE_SPACE) * - blank: u_isblank(c) or u_hasBinaryProperty(c, UCHAR_POSIX_BLANK) * - cntrl: u_charType(c)==U_CONTROL_CHAR * - graph: u_hasBinaryProperty(c, UCHAR_POSIX_GRAPH) * - print: u_hasBinaryProperty(c, UCHAR_POSIX_PRINT) * * Note: Some of the u_isxyz() functions in uchar.h predate, and do not match, * the Standard Recommendations in UTS #18. Instead, they match Java * functions according to their API documentation. * * \htmlonly * The C/POSIX character classes are also available in UnicodeSet patterns, * using patterns like [:graph:] or \p{graph}. * \endhtmlonly * * Note: There are several ICU whitespace functions. * Comparison: * - u_isUWhiteSpace=UCHAR_WHITE_SPACE: Unicode White_Space property; * most of general categories "Z" (separators) + most whitespace ISO controls * (including no-break spaces, but excluding IS1..IS4) * - u_isWhitespace: Java isWhitespace; Z + whitespace ISO controls but excluding no-break spaces * - u_isJavaSpaceChar: Java isSpaceChar; just Z (including no-break spaces) * - u_isspace: Z + whitespace ISO controls (including no-break spaces) * - u_isblank: "horizontal spaces" = TAB + Zs */ /** * Constants. */ /** The lowest Unicode code point value. Code points are non-negative. 
@stable ICU 2.0 */ #define UCHAR_MIN_VALUE 0 /** * The highest Unicode code point value (scalar value) according to * The Unicode Standard. This is a 21-bit value (20.1 bits, rounded up). * For a single character, UChar32 is a simple type that can hold any code point value. * * @see UChar32 * @stable ICU 2.0 */ #define UCHAR_MAX_VALUE 0x10ffff /** * Get a single-bit bit set (a flag) from a bit number 0..31. * @stable ICU 2.1 */ #define U_MASK(x) ((uint32_t)1<<(x)) /** * Selection constants for Unicode properties. * These constants are used in functions like u_hasBinaryProperty to select * one of the Unicode properties. * * The properties APIs are intended to reflect Unicode properties as defined * in the Unicode Character Database (UCD) and Unicode Technical Reports (UTR). * * For details about the properties see * UAX #44: Unicode Character Database (http://www.unicode.org/reports/tr44/). * * Important: If ICU is built with UCD files from Unicode versions below, e.g., 3.2, * then properties marked with "new in Unicode 3.2" are not or not fully available. * Check u_getUnicodeVersion to be sure. * * @see u_hasBinaryProperty * @see u_getIntPropertyValue * @see u_getUnicodeVersion * @stable ICU 2.1 */ typedef enum UProperty { /* * Note: UProperty constants are parsed by preparseucd.py. * It matches lines like * UCHAR_<name>=<integer>, */ /* Note: Place UCHAR_ALPHABETIC before UCHAR_BINARY_START so that debuggers display UCHAR_ALPHABETIC as the symbolic name for 0, rather than UCHAR_BINARY_START. Likewise for other *_START identifiers. */ /** Binary property Alphabetic. Same as u_isUAlphabetic, different from u_isalpha. Lu+Ll+Lt+Lm+Lo+Nl+Other_Alphabetic @stable ICU 2.1 */ UCHAR_ALPHABETIC=0, /** First constant for binary Unicode properties. @stable ICU 2.1 */ UCHAR_BINARY_START=UCHAR_ALPHABETIC, /** Binary property ASCII_Hex_Digit. 0-9 A-F a-f @stable ICU 2.1 */ UCHAR_ASCII_HEX_DIGIT=1, /** Binary property Bidi_Control.
Format controls which have specific functions in the Bidi Algorithm. @stable ICU 2.1 */ UCHAR_BIDI_CONTROL=2, /** Binary property Bidi_Mirrored. Characters that may change display in RTL text. Same as u_isMirrored. See Bidi Algorithm, UTR 9. @stable ICU 2.1 */ UCHAR_BIDI_MIRRORED=3, /** Binary property Dash. Variations of dashes. @stable ICU 2.1 */ UCHAR_DASH=4, /** Binary property Default_Ignorable_Code_Point (new in Unicode 3.2). Ignorable in most processing. <2060..206F, FFF0..FFFB, E0000..E0FFF>+Other_Default_Ignorable_Code_Point+(Cf+Cc+Cs-White_Space) @stable ICU 2.1 */ UCHAR_DEFAULT_IGNORABLE_CODE_POINT=5, /** Binary property Deprecated (new in Unicode 3.2). The usage of deprecated characters is strongly discouraged. @stable ICU 2.1 */ UCHAR_DEPRECATED=6, /** Binary property Diacritic. Characters that linguistically modify the meaning of another character to which they apply. @stable ICU 2.1 */ UCHAR_DIACRITIC=7, /** Binary property Extender. Extend the value or shape of a preceding alphabetic character, e.g., length and iteration marks. @stable ICU 2.1 */ UCHAR_EXTENDER=8, /** Binary property Full_Composition_Exclusion. CompositionExclusions.txt+Singleton Decompositions+ Non-Starter Decompositions. @stable ICU 2.1 */ UCHAR_FULL_COMPOSITION_EXCLUSION=9, /** Binary property Grapheme_Base (new in Unicode 3.2). For programmatic determination of grapheme cluster boundaries. [0..10FFFF]-Cc-Cf-Cs-Co-Cn-Zl-Zp-Grapheme_Link-Grapheme_Extend-CGJ @stable ICU 2.1 */ UCHAR_GRAPHEME_BASE=10, /** Binary property Grapheme_Extend (new in Unicode 3.2). For programmatic determination of grapheme cluster boundaries. Me+Mn+Mc+Other_Grapheme_Extend-Grapheme_Link-CGJ @stable ICU 2.1 */ UCHAR_GRAPHEME_EXTEND=11, /** Binary property Grapheme_Link (new in Unicode 3.2). For programmatic determination of grapheme cluster boundaries. @stable ICU 2.1 */ UCHAR_GRAPHEME_LINK=12, /** Binary property Hex_Digit. Characters commonly used for hexadecimal numbers. 
@stable ICU 2.1 */ UCHAR_HEX_DIGIT=13, /** Binary property Hyphen. Dashes used to mark connections between pieces of words, plus the Katakana middle dot. @stable ICU 2.1 */ UCHAR_HYPHEN=14, /** Binary property ID_Continue. Characters that can continue an identifier. DerivedCoreProperties.txt also says "NOTE: Cf characters should be filtered out." ID_Start+Mn+Mc+Nd+Pc @stable ICU 2.1 */ UCHAR_ID_CONTINUE=15, /** Binary property ID_Start. Characters that can start an identifier. Lu+Ll+Lt+Lm+Lo+Nl @stable ICU 2.1 */ UCHAR_ID_START=16, /** Binary property Ideographic. CJKV ideographs. @stable ICU 2.1 */ UCHAR_IDEOGRAPHIC=17, /** Binary property IDS_Binary_Operator (new in Unicode 3.2). For programmatic determination of Ideographic Description Sequences. @stable ICU 2.1 */ UCHAR_IDS_BINARY_OPERATOR=18, /** Binary property IDS_Trinary_Operator (new in Unicode 3.2). For programmatic determination of Ideographic Description Sequences. @stable ICU 2.1 */ UCHAR_IDS_TRINARY_OPERATOR=19, /** Binary property Join_Control. Format controls for cursive joining and ligation. @stable ICU 2.1 */ UCHAR_JOIN_CONTROL=20, /** Binary property Logical_Order_Exception (new in Unicode 3.2). Characters that do not use logical order and require special handling in most processing. @stable ICU 2.1 */ UCHAR_LOGICAL_ORDER_EXCEPTION=21, /** Binary property Lowercase. Same as u_isULowercase, different from u_islower. Ll+Other_Lowercase @stable ICU 2.1 */ UCHAR_LOWERCASE=22, /** Binary property Math. Sm+Other_Math @stable ICU 2.1 */ UCHAR_MATH=23, /** Binary property Noncharacter_Code_Point. Code points that are explicitly defined as illegal for the encoding of characters. @stable ICU 2.1 */ UCHAR_NONCHARACTER_CODE_POINT=24, /** Binary property Quotation_Mark. @stable ICU 2.1 */ UCHAR_QUOTATION_MARK=25, /** Binary property Radical (new in Unicode 3.2). For programmatic determination of Ideographic Description Sequences. 
@stable ICU 2.1 */ UCHAR_RADICAL=26, /** Binary property Soft_Dotted (new in Unicode 3.2). Characters with a "soft dot", like i or j. An accent placed on these characters causes the dot to disappear. @stable ICU 2.1 */ UCHAR_SOFT_DOTTED=27, /** Binary property Terminal_Punctuation. Punctuation characters that generally mark the end of textual units. @stable ICU 2.1 */ UCHAR_TERMINAL_PUNCTUATION=28, /** Binary property Unified_Ideograph (new in Unicode 3.2). For programmatic determination of Ideographic Description Sequences. @stable ICU 2.1 */ UCHAR_UNIFIED_IDEOGRAPH=29, /** Binary property Uppercase. Same as u_isUUppercase, different from u_isupper. Lu+Other_Uppercase @stable ICU 2.1 */ UCHAR_UPPERCASE=30, /** Binary property White_Space. Same as u_isUWhiteSpace, different from u_isspace and u_isWhitespace. Space characters+TAB+CR+LF-ZWSP-ZWNBSP @stable ICU 2.1 */ UCHAR_WHITE_SPACE=31, /** Binary property XID_Continue. ID_Continue modified to allow closure under normalization forms NFKC and NFKD. @stable ICU 2.1 */ UCHAR_XID_CONTINUE=32, /** Binary property XID_Start. ID_Start modified to allow closure under normalization forms NFKC and NFKD. @stable ICU 2.1 */ UCHAR_XID_START=33, /** Binary property Case_Sensitive. Either the source of a case mapping or _in_ the target of a case mapping. Not the same as the general category Cased_Letter. @stable ICU 2.6 */ UCHAR_CASE_SENSITIVE=34, /** Binary property STerm (new in Unicode 4.0.1). Sentence Terminal. Used in UAX #29: Text Boundaries (http://www.unicode.org/reports/tr29/) @stable ICU 3.0 */ UCHAR_S_TERM=35, /** Binary property Variation_Selector (new in Unicode 4.0.1). Indicates all those characters that qualify as Variation Selectors. For details on the behavior of these characters, see StandardizedVariants.html and 15.6 Variation Selectors. @stable ICU 3.0 */ UCHAR_VARIATION_SELECTOR=36, /** Binary property NFD_Inert. 
ICU-specific property for characters that are inert under NFD, i.e., they do not interact with adjacent characters. See the documentation for the Normalizer2 class and the Normalizer2::isInert() method. @stable ICU 3.0 */ UCHAR_NFD_INERT=37, /** Binary property NFKD_Inert. ICU-specific property for characters that are inert under NFKD, i.e., they do not interact with adjacent characters. See the documentation for the Normalizer2 class and the Normalizer2::isInert() method. @stable ICU 3.0 */ UCHAR_NFKD_INERT=38, /** Binary property NFC_Inert. ICU-specific property for characters that are inert under NFC, i.e., they do not interact with adjacent characters. See the documentation for the Normalizer2 class and the Normalizer2::isInert() method. @stable ICU 3.0 */ UCHAR_NFC_INERT=39, /** Binary property NFKC_Inert. ICU-specific property for characters that are inert under NFKC, i.e., they do not interact with adjacent characters. See the documentation for the Normalizer2 class and the Normalizer2::isInert() method. @stable ICU 3.0 */ UCHAR_NFKC_INERT=40, /** Binary Property Segment_Starter. ICU-specific property for characters that are starters in terms of Unicode normalization and combining character sequences. They have ccc=0 and do not occur in non-initial position of the canonical decomposition of any character (like a-umlaut in NFD and a Jamo T in an NFD(Hangul LVT)). ICU uses this property for segmenting a string for generating a set of canonically equivalent strings, e.g. for canonical closure while processing collation tailoring rules. @stable ICU 3.0 */ UCHAR_SEGMENT_STARTER=41, /** Binary property Pattern_Syntax (new in Unicode 4.1). See UAX #31 Identifier and Pattern Syntax (http://www.unicode.org/reports/tr31/) @stable ICU 3.4 */ UCHAR_PATTERN_SYNTAX=42, /** Binary property Pattern_White_Space (new in Unicode 4.1). 
See UAX #31 Identifier and Pattern Syntax (http://www.unicode.org/reports/tr31/) @stable ICU 3.4 */ UCHAR_PATTERN_WHITE_SPACE=43, /** Binary property alnum (a C/POSIX character class). Implemented according to the UTS #18 Annex C Standard Recommendation. See the uchar.h file documentation. @stable ICU 3.4 */ UCHAR_POSIX_ALNUM=44, /** Binary property blank (a C/POSIX character class). Implemented according to the UTS #18 Annex C Standard Recommendation. See the uchar.h file documentation. @stable ICU 3.4 */ UCHAR_POSIX_BLANK=45, /** Binary property graph (a C/POSIX character class). Implemented according to the UTS #18 Annex C Standard Recommendation. See the uchar.h file documentation. @stable ICU 3.4 */ UCHAR_POSIX_GRAPH=46, /** Binary property print (a C/POSIX character class). Implemented according to the UTS #18 Annex C Standard Recommendation. See the uchar.h file documentation. @stable ICU 3.4 */ UCHAR_POSIX_PRINT=47, /** Binary property xdigit (a C/POSIX character class). Implemented according to the UTS #18 Annex C Standard Recommendation. See the uchar.h file documentation. @stable ICU 3.4 */ UCHAR_POSIX_XDIGIT=48, /** Binary property Cased. For Lowercase, Uppercase and Titlecase characters. @stable ICU 4.4 */ UCHAR_CASED=49, /** Binary property Case_Ignorable. Used in context-sensitive case mappings. @stable ICU 4.4 */ UCHAR_CASE_IGNORABLE=50, /** Binary property Changes_When_Lowercased. @stable ICU 4.4 */ UCHAR_CHANGES_WHEN_LOWERCASED=51, /** Binary property Changes_When_Uppercased. @stable ICU 4.4 */ UCHAR_CHANGES_WHEN_UPPERCASED=52, /** Binary property Changes_When_Titlecased. @stable ICU 4.4 */ UCHAR_CHANGES_WHEN_TITLECASED=53, /** Binary property Changes_When_Casefolded. @stable ICU 4.4 */ UCHAR_CHANGES_WHEN_CASEFOLDED=54, /** Binary property Changes_When_Casemapped. @stable ICU 4.4 */ UCHAR_CHANGES_WHEN_CASEMAPPED=55, /** Binary property Changes_When_NFKC_Casefolded. 
@stable ICU 4.4 */ UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED=56, /** * Binary property Emoji. * See http://www.unicode.org/reports/tr51/#Emoji_Properties * * @stable ICU 57 */ UCHAR_EMOJI=57, /** * Binary property Emoji_Presentation. * See http://www.unicode.org/reports/tr51/#Emoji_Properties * * @stable ICU 57 */ UCHAR_EMOJI_PRESENTATION=58, /** * Binary property Emoji_Modifier. * See http://www.unicode.org/reports/tr51/#Emoji_Properties * * @stable ICU 57 */ UCHAR_EMOJI_MODIFIER=59, /** * Binary property Emoji_Modifier_Base. * See http://www.unicode.org/reports/tr51/#Emoji_Properties * * @stable ICU 57 */ UCHAR_EMOJI_MODIFIER_BASE=60, /** * Binary property Emoji_Component. * See http://www.unicode.org/reports/tr51/#Emoji_Properties * * @stable ICU 60 */ UCHAR_EMOJI_COMPONENT=61, /** * Binary property Regional_Indicator. * @stable ICU 60 */ UCHAR_REGIONAL_INDICATOR=62, /** * Binary property Prepended_Concatenation_Mark. * @stable ICU 60 */ UCHAR_PREPENDED_CONCATENATION_MARK=63, #if (NTDDI_VERSION >= NTDDI_WIN10_19H1) /** * Binary property Extended_Pictographic. * See http://www.unicode.org/reports/tr51/#Emoji_Properties * * @stable ICU 62 */ UCHAR_EXTENDED_PICTOGRAPHIC=64, #endif // (NTDDI_VERSION >= NTDDI_WIN10_19H1) /** Enumerated property Bidi_Class. Same as u_charDirection, returns UCharDirection values. @stable ICU 2.2 */ UCHAR_BIDI_CLASS=0x1000, /** First constant for enumerated/integer Unicode properties. @stable ICU 2.2 */ UCHAR_INT_START=UCHAR_BIDI_CLASS, /** Enumerated property Block. Same as ublock_getCode, returns UBlockCode values. @stable ICU 2.2 */ UCHAR_BLOCK=0x1001, /** Enumerated property Canonical_Combining_Class. Same as u_getCombiningClass, returns 8-bit numeric values. @stable ICU 2.2 */ UCHAR_CANONICAL_COMBINING_CLASS=0x1002, /** Enumerated property Decomposition_Type. Returns UDecompositionType values. @stable ICU 2.2 */ UCHAR_DECOMPOSITION_TYPE=0x1003, /** Enumerated property East_Asian_Width. 
See http://www.unicode.org/reports/tr11/ Returns UEastAsianWidth values. @stable ICU 2.2 */ UCHAR_EAST_ASIAN_WIDTH=0x1004, /** Enumerated property General_Category. Same as u_charType, returns UCharCategory values. @stable ICU 2.2 */ UCHAR_GENERAL_CATEGORY=0x1005, /** Enumerated property Joining_Group. Returns UJoiningGroup values. @stable ICU 2.2 */ UCHAR_JOINING_GROUP=0x1006, /** Enumerated property Joining_Type. Returns UJoiningType values. @stable ICU 2.2 */ UCHAR_JOINING_TYPE=0x1007, /** Enumerated property Line_Break. Returns ULineBreak values. @stable ICU 2.2 */ UCHAR_LINE_BREAK=0x1008, /** Enumerated property Numeric_Type. Returns UNumericType values. @stable ICU 2.2 */ UCHAR_NUMERIC_TYPE=0x1009, /** Enumerated property Script. Same as uscript_getScript, returns UScriptCode values. @stable ICU 2.2 */ UCHAR_SCRIPT=0x100A, /** Enumerated property Hangul_Syllable_Type, new in Unicode 4. Returns UHangulSyllableType values. @stable ICU 2.6 */ UCHAR_HANGUL_SYLLABLE_TYPE=0x100B, /** Enumerated property NFD_Quick_Check. Returns UNormalizationCheckResult values. @stable ICU 3.0 */ UCHAR_NFD_QUICK_CHECK=0x100C, /** Enumerated property NFKD_Quick_Check. Returns UNormalizationCheckResult values. @stable ICU 3.0 */ UCHAR_NFKD_QUICK_CHECK=0x100D, /** Enumerated property NFC_Quick_Check. Returns UNormalizationCheckResult values. @stable ICU 3.0 */ UCHAR_NFC_QUICK_CHECK=0x100E, /** Enumerated property NFKC_Quick_Check. Returns UNormalizationCheckResult values. @stable ICU 3.0 */ UCHAR_NFKC_QUICK_CHECK=0x100F, /** Enumerated property Lead_Canonical_Combining_Class. ICU-specific property for the ccc of the first code point of the decomposition, or lccc(c)=ccc(NFD(c)[0]). Useful for checking for canonically ordered text; see UNORM_FCD and http://www.unicode.org/notes/tn5/#FCD . Returns 8-bit numeric values like UCHAR_CANONICAL_COMBINING_CLASS. @stable ICU 3.0 */ UCHAR_LEAD_CANONICAL_COMBINING_CLASS=0x1010, /** Enumerated property Trail_Canonical_Combining_Class. 
ICU-specific property for the ccc of the last code point of the decomposition, or tccc(c)=ccc(NFD(c)[last]). Useful for checking for canonically ordered text; see UNORM_FCD and http://www.unicode.org/notes/tn5/#FCD . Returns 8-bit numeric values like UCHAR_CANONICAL_COMBINING_CLASS. @stable ICU 3.0 */ UCHAR_TRAIL_CANONICAL_COMBINING_CLASS=0x1011, /** Enumerated property Grapheme_Cluster_Break (new in Unicode 4.1). Used in UAX #29: Text Boundaries (http://www.unicode.org/reports/tr29/) Returns UGraphemeClusterBreak values. @stable ICU 3.4 */ UCHAR_GRAPHEME_CLUSTER_BREAK=0x1012, /** Enumerated property Sentence_Break (new in Unicode 4.1). Used in UAX #29: Text Boundaries (http://www.unicode.org/reports/tr29/) Returns USentenceBreak values. @stable ICU 3.4 */ UCHAR_SENTENCE_BREAK=0x1013, /** Enumerated property Word_Break (new in Unicode 4.1). Used in UAX #29: Text Boundaries (http://www.unicode.org/reports/tr29/) Returns UWordBreakValues values. @stable ICU 3.4 */ UCHAR_WORD_BREAK=0x1014, /** Enumerated property Bidi_Paired_Bracket_Type (new in Unicode 6.3). Used in UAX #9: Unicode Bidirectional Algorithm (http://www.unicode.org/reports/tr9/) Returns UBidiPairedBracketType values. @stable ICU 52 */ UCHAR_BIDI_PAIRED_BRACKET_TYPE=0x1015, #if (NTDDI_VERSION >= NTDDI_WIN10_19H1) /** * Enumerated property Indic_Positional_Category. * New in Unicode 6.0 as provisional property Indic_Matra_Category; * renamed and changed to informative in Unicode 8.0. * See http://www.unicode.org/reports/tr44/#IndicPositionalCategory.txt * @stable ICU 63 */ UCHAR_INDIC_POSITIONAL_CATEGORY=0x1016, /** * Enumerated property Indic_Syllabic_Category. * New in Unicode 6.0 as provisional; informative since Unicode 8.0. * See http://www.unicode.org/reports/tr44/#IndicSyllabicCategory.txt * @stable ICU 63 */ UCHAR_INDIC_SYLLABIC_CATEGORY=0x1017, /** * Enumerated property Vertical_Orientation. * Used for UAX #50 Unicode Vertical Text Layout (https://www.unicode.org/reports/tr50/). 
* New as a UCD property in Unicode 10.0. * @stable ICU 63 */ UCHAR_VERTICAL_ORIENTATION=0x1018, #endif // (NTDDI_VERSION >= NTDDI_WIN10_19H1) /** Bitmask property General_Category_Mask. This is the General_Category property returned as a bit mask. When used in u_getIntPropertyValue(c), same as U_MASK(u_charType(c)), returns bit masks for UCharCategory values where exactly one bit is set. When used with u_getPropertyValueName() and u_getPropertyValueEnum(), a multi-bit mask is used for sets of categories like "Letters". Mask values should be cast to uint32_t. @stable ICU 2.4 */ UCHAR_GENERAL_CATEGORY_MASK=0x2000, /** First constant for bit-mask Unicode properties. @stable ICU 2.4 */ UCHAR_MASK_START=UCHAR_GENERAL_CATEGORY_MASK, /** Double property Numeric_Value. Corresponds to u_getNumericValue. @stable ICU 2.4 */ UCHAR_NUMERIC_VALUE=0x3000, /** First constant for double Unicode properties. @stable ICU 2.4 */ UCHAR_DOUBLE_START=UCHAR_NUMERIC_VALUE, /** String property Age. Corresponds to u_charAge. @stable ICU 2.4 */ UCHAR_AGE=0x4000, /** First constant for string Unicode properties. @stable ICU 2.4 */ UCHAR_STRING_START=UCHAR_AGE, /** String property Bidi_Mirroring_Glyph. Corresponds to u_charMirror. @stable ICU 2.4 */ UCHAR_BIDI_MIRRORING_GLYPH=0x4001, /** String property Case_Folding. Corresponds to u_strFoldCase in ustring.h. @stable ICU 2.4 */ UCHAR_CASE_FOLDING=0x4002, /** String property Lowercase_Mapping. Corresponds to u_strToLower in ustring.h. @stable ICU 2.4 */ UCHAR_LOWERCASE_MAPPING=0x4004, /** String property Name. Corresponds to u_charName. @stable ICU 2.4 */ UCHAR_NAME=0x4005, /** String property Simple_Case_Folding. Corresponds to u_foldCase. @stable ICU 2.4 */ UCHAR_SIMPLE_CASE_FOLDING=0x4006, /** String property Simple_Lowercase_Mapping. Corresponds to u_tolower. @stable ICU 2.4 */ UCHAR_SIMPLE_LOWERCASE_MAPPING=0x4007, /** String property Simple_Titlecase_Mapping. Corresponds to u_totitle. 
@stable ICU 2.4 */ UCHAR_SIMPLE_TITLECASE_MAPPING=0x4008, /** String property Simple_Uppercase_Mapping. Corresponds to u_toupper. @stable ICU 2.4 */ UCHAR_SIMPLE_UPPERCASE_MAPPING=0x4009, /** String property Titlecase_Mapping. Corresponds to u_strToTitle in ustring.h. @stable ICU 2.4 */ UCHAR_TITLECASE_MAPPING=0x400A, /** String property Uppercase_Mapping. Corresponds to u_strToUpper in ustring.h. @stable ICU 2.4 */ UCHAR_UPPERCASE_MAPPING=0x400C, /** String property Bidi_Paired_Bracket (new in Unicode 6.3). Corresponds to u_getBidiPairedBracket. @stable ICU 52 */ UCHAR_BIDI_PAIRED_BRACKET=0x400D, /** Miscellaneous property Script_Extensions (new in Unicode 6.0). Some characters are commonly used in multiple scripts. For more information, see UAX #24: http://www.unicode.org/reports/tr24/. Corresponds to uscript_hasScript and uscript_getScriptExtensions in uscript.h. @stable ICU 4.6 */ UCHAR_SCRIPT_EXTENSIONS=0x7000, /** First constant for Unicode properties with unusual value types. @stable ICU 4.6 */ UCHAR_OTHER_PROPERTY_START=UCHAR_SCRIPT_EXTENSIONS, /** Represents a nonexistent or invalid property or property value. @stable ICU 2.4 */ UCHAR_INVALID_CODE = -1 } UProperty; /** * Data for enumerated Unicode general category types. * See http://www.unicode.org/Public/UNIDATA/UnicodeData.html . * @stable ICU 2.0 */ typedef enum UCharCategory { /* * Note: UCharCategory constants and their API comments are parsed by preparseucd.py. * It matches pairs of lines like * / ** comment... * / * U_<[A-Z_]+> = , */ /** Non-category for unassigned and non-character code points. @stable ICU 2.0 */ U_UNASSIGNED = 0, /** Cn "Other, Not Assigned (no characters in [UnicodeData.txt] have this property)" (same as U_UNASSIGNED!) 
@stable ICU 2.0 */ U_GENERAL_OTHER_TYPES = 0, /** Lu @stable ICU 2.0 */ U_UPPERCASE_LETTER = 1, /** Ll @stable ICU 2.0 */ U_LOWERCASE_LETTER = 2, /** Lt @stable ICU 2.0 */ U_TITLECASE_LETTER = 3, /** Lm @stable ICU 2.0 */ U_MODIFIER_LETTER = 4, /** Lo @stable ICU 2.0 */ U_OTHER_LETTER = 5, /** Mn @stable ICU 2.0 */ U_NON_SPACING_MARK = 6, /** Me @stable ICU 2.0 */ U_ENCLOSING_MARK = 7, /** Mc @stable ICU 2.0 */ U_COMBINING_SPACING_MARK = 8, /** Nd @stable ICU 2.0 */ U_DECIMAL_DIGIT_NUMBER = 9, /** Nl @stable ICU 2.0 */ U_LETTER_NUMBER = 10, /** No @stable ICU 2.0 */ U_OTHER_NUMBER = 11, /** Zs @stable ICU 2.0 */ U_SPACE_SEPARATOR = 12, /** Zl @stable ICU 2.0 */ U_LINE_SEPARATOR = 13, /** Zp @stable ICU 2.0 */ U_PARAGRAPH_SEPARATOR = 14, /** Cc @stable ICU 2.0 */ U_CONTROL_CHAR = 15, /** Cf @stable ICU 2.0 */ U_FORMAT_CHAR = 16, /** Co @stable ICU 2.0 */ U_PRIVATE_USE_CHAR = 17, /** Cs @stable ICU 2.0 */ U_SURROGATE = 18, /** Pd @stable ICU 2.0 */ U_DASH_PUNCTUATION = 19, /** Ps @stable ICU 2.0 */ U_START_PUNCTUATION = 20, /** Pe @stable ICU 2.0 */ U_END_PUNCTUATION = 21, /** Pc @stable ICU 2.0 */ U_CONNECTOR_PUNCTUATION = 22, /** Po @stable ICU 2.0 */ U_OTHER_PUNCTUATION = 23, /** Sm @stable ICU 2.0 */ U_MATH_SYMBOL = 24, /** Sc @stable ICU 2.0 */ U_CURRENCY_SYMBOL = 25, /** Sk @stable ICU 2.0 */ U_MODIFIER_SYMBOL = 26, /** So @stable ICU 2.0 */ U_OTHER_SYMBOL = 27, /** Pi @stable ICU 2.0 */ U_INITIAL_PUNCTUATION = 28, /** Pf @stable ICU 2.0 */ U_FINAL_PUNCTUATION = 29, /** * One higher than the last enum UCharCategory constant. * This numeric value is stable (will not change), see * http://www.unicode.org/policies/stability_policy.html#Property_Value * * @stable ICU 2.0 */ U_CHAR_CATEGORY_COUNT } UCharCategory; /** * U_GC_XX_MASK constants are bit flags corresponding to Unicode * general category values. * For each category, the nth bit is set if the numeric value of the * corresponding UCharCategory constant is n. 
* * There are also some U_GC_Y_MASK constants for groups of general categories * like L for all letter categories. * * @see u_charType * @see U_GET_GC_MASK * @see UCharCategory * @stable ICU 2.1 */ #define U_GC_CN_MASK U_MASK(U_GENERAL_OTHER_TYPES) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_LU_MASK U_MASK(U_UPPERCASE_LETTER) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_LL_MASK U_MASK(U_LOWERCASE_LETTER) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_LT_MASK U_MASK(U_TITLECASE_LETTER) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_LM_MASK U_MASK(U_MODIFIER_LETTER) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_LO_MASK U_MASK(U_OTHER_LETTER) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_MN_MASK U_MASK(U_NON_SPACING_MARK) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_ME_MASK U_MASK(U_ENCLOSING_MARK) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_MC_MASK U_MASK(U_COMBINING_SPACING_MARK) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_ND_MASK U_MASK(U_DECIMAL_DIGIT_NUMBER) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_NL_MASK U_MASK(U_LETTER_NUMBER) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_NO_MASK U_MASK(U_OTHER_NUMBER) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_ZS_MASK U_MASK(U_SPACE_SEPARATOR) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_ZL_MASK U_MASK(U_LINE_SEPARATOR) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_ZP_MASK U_MASK(U_PARAGRAPH_SEPARATOR) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_CC_MASK U_MASK(U_CONTROL_CHAR) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_CF_MASK U_MASK(U_FORMAT_CHAR) /** Mask constant for a UCharCategory. 
@stable ICU 2.1 */ #define U_GC_CO_MASK U_MASK(U_PRIVATE_USE_CHAR) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_CS_MASK U_MASK(U_SURROGATE) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_PD_MASK U_MASK(U_DASH_PUNCTUATION) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_PS_MASK U_MASK(U_START_PUNCTUATION) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_PE_MASK U_MASK(U_END_PUNCTUATION) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_PC_MASK U_MASK(U_CONNECTOR_PUNCTUATION) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_PO_MASK U_MASK(U_OTHER_PUNCTUATION) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_SM_MASK U_MASK(U_MATH_SYMBOL) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_SC_MASK U_MASK(U_CURRENCY_SYMBOL) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_SK_MASK U_MASK(U_MODIFIER_SYMBOL) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_SO_MASK U_MASK(U_OTHER_SYMBOL) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_PI_MASK U_MASK(U_INITIAL_PUNCTUATION) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_PF_MASK U_MASK(U_FINAL_PUNCTUATION) /** Mask constant for multiple UCharCategory bits (L Letters). @stable ICU 2.1 */ #define U_GC_L_MASK \ (U_GC_LU_MASK|U_GC_LL_MASK|U_GC_LT_MASK|U_GC_LM_MASK|U_GC_LO_MASK) /** Mask constant for multiple UCharCategory bits (LC Cased Letters). @stable ICU 2.1 */ #define U_GC_LC_MASK \ (U_GC_LU_MASK|U_GC_LL_MASK|U_GC_LT_MASK) /** Mask constant for multiple UCharCategory bits (M Marks). @stable ICU 2.1 */ #define U_GC_M_MASK (U_GC_MN_MASK|U_GC_ME_MASK|U_GC_MC_MASK) /** Mask constant for multiple UCharCategory bits (N Numbers). 
@stable ICU 2.1 */ #define U_GC_N_MASK (U_GC_ND_MASK|U_GC_NL_MASK|U_GC_NO_MASK) /** Mask constant for multiple UCharCategory bits (Z Separators). @stable ICU 2.1 */ #define U_GC_Z_MASK (U_GC_ZS_MASK|U_GC_ZL_MASK|U_GC_ZP_MASK) /** Mask constant for multiple UCharCategory bits (C Others). @stable ICU 2.1 */ #define U_GC_C_MASK \ (U_GC_CN_MASK|U_GC_CC_MASK|U_GC_CF_MASK|U_GC_CO_MASK|U_GC_CS_MASK) /** Mask constant for multiple UCharCategory bits (P Punctuation). @stable ICU 2.1 */ #define U_GC_P_MASK \ (U_GC_PD_MASK|U_GC_PS_MASK|U_GC_PE_MASK|U_GC_PC_MASK|U_GC_PO_MASK| \ U_GC_PI_MASK|U_GC_PF_MASK) /** Mask constant for multiple UCharCategory bits (S Symbols). @stable ICU 2.1 */ #define U_GC_S_MASK (U_GC_SM_MASK|U_GC_SC_MASK|U_GC_SK_MASK|U_GC_SO_MASK) /** * This specifies the language directional property of a character set. * @stable ICU 2.0 */ typedef enum UCharDirection { /* * Note: UCharDirection constants and their API comments are parsed by preparseucd.py. * It matches pairs of lines like * / ** comment... 
* / * U_<[A-Z_]+> = , */ /** L @stable ICU 2.0 */ U_LEFT_TO_RIGHT = 0, /** R @stable ICU 2.0 */ U_RIGHT_TO_LEFT = 1, /** EN @stable ICU 2.0 */ U_EUROPEAN_NUMBER = 2, /** ES @stable ICU 2.0 */ U_EUROPEAN_NUMBER_SEPARATOR = 3, /** ET @stable ICU 2.0 */ U_EUROPEAN_NUMBER_TERMINATOR = 4, /** AN @stable ICU 2.0 */ U_ARABIC_NUMBER = 5, /** CS @stable ICU 2.0 */ U_COMMON_NUMBER_SEPARATOR = 6, /** B @stable ICU 2.0 */ U_BLOCK_SEPARATOR = 7, /** S @stable ICU 2.0 */ U_SEGMENT_SEPARATOR = 8, /** WS @stable ICU 2.0 */ U_WHITE_SPACE_NEUTRAL = 9, /** ON @stable ICU 2.0 */ U_OTHER_NEUTRAL = 10, /** LRE @stable ICU 2.0 */ U_LEFT_TO_RIGHT_EMBEDDING = 11, /** LRO @stable ICU 2.0 */ U_LEFT_TO_RIGHT_OVERRIDE = 12, /** AL @stable ICU 2.0 */ U_RIGHT_TO_LEFT_ARABIC = 13, /** RLE @stable ICU 2.0 */ U_RIGHT_TO_LEFT_EMBEDDING = 14, /** RLO @stable ICU 2.0 */ U_RIGHT_TO_LEFT_OVERRIDE = 15, /** PDF @stable ICU 2.0 */ U_POP_DIRECTIONAL_FORMAT = 16, /** NSM @stable ICU 2.0 */ U_DIR_NON_SPACING_MARK = 17, /** BN @stable ICU 2.0 */ U_BOUNDARY_NEUTRAL = 18, /** FSI @stable ICU 52 */ U_FIRST_STRONG_ISOLATE = 19, /** LRI @stable ICU 52 */ U_LEFT_TO_RIGHT_ISOLATE = 20, /** RLI @stable ICU 52 */ U_RIGHT_TO_LEFT_ISOLATE = 21, /** PDI @stable ICU 52 */ U_POP_DIRECTIONAL_ISOLATE = 22, } UCharDirection; /** * Bidi Paired Bracket Type constants. * * @see UCHAR_BIDI_PAIRED_BRACKET_TYPE * @stable ICU 52 */ typedef enum UBidiPairedBracketType { /* * Note: UBidiPairedBracketType constants are parsed by preparseucd.py. * It matches lines like * U_BPT_ */ /** Not a paired bracket. @stable ICU 52 */ U_BPT_NONE, /** Open paired bracket. @stable ICU 52 */ U_BPT_OPEN, /** Close paired bracket. @stable ICU 52 */ U_BPT_CLOSE, } UBidiPairedBracketType; /** * Constants for Unicode blocks, see the Unicode Data file Blocks.txt * @stable ICU 2.0 */ enum UBlockCode { /* * Note: UBlockCode constants are parsed by preparseucd.py. * It matches lines like * UBLOCK_ = , */ /** New No_Block value in Unicode 4. 
@stable ICU 2.6 */ UBLOCK_NO_BLOCK = 0, /*[none]*/ /* Special range indicating No_Block */ /** @stable ICU 2.0 */ UBLOCK_BASIC_LATIN = 1, /*[0000]*/ /** @stable ICU 2.0 */ UBLOCK_LATIN_1_SUPPLEMENT=2, /*[0080]*/ /** @stable ICU 2.0 */ UBLOCK_LATIN_EXTENDED_A =3, /*[0100]*/ /** @stable ICU 2.0 */ UBLOCK_LATIN_EXTENDED_B =4, /*[0180]*/ /** @stable ICU 2.0 */ UBLOCK_IPA_EXTENSIONS =5, /*[0250]*/ /** @stable ICU 2.0 */ UBLOCK_SPACING_MODIFIER_LETTERS =6, /*[02B0]*/ /** @stable ICU 2.0 */ UBLOCK_COMBINING_DIACRITICAL_MARKS =7, /*[0300]*/ /** * Unicode 3.2 renames this block to "Greek and Coptic". * @stable ICU 2.0 */ UBLOCK_GREEK =8, /*[0370]*/ /** @stable ICU 2.0 */ UBLOCK_CYRILLIC =9, /*[0400]*/ /** @stable ICU 2.0 */ UBLOCK_ARMENIAN =10, /*[0530]*/ /** @stable ICU 2.0 */ UBLOCK_HEBREW =11, /*[0590]*/ /** @stable ICU 2.0 */ UBLOCK_ARABIC =12, /*[0600]*/ /** @stable ICU 2.0 */ UBLOCK_SYRIAC =13, /*[0700]*/ /** @stable ICU 2.0 */ UBLOCK_THAANA =14, /*[0780]*/ /** @stable ICU 2.0 */ UBLOCK_DEVANAGARI =15, /*[0900]*/ /** @stable ICU 2.0 */ UBLOCK_BENGALI =16, /*[0980]*/ /** @stable ICU 2.0 */ UBLOCK_GURMUKHI =17, /*[0A00]*/ /** @stable ICU 2.0 */ UBLOCK_GUJARATI =18, /*[0A80]*/ /** @stable ICU 2.0 */ UBLOCK_ORIYA =19, /*[0B00]*/ /** @stable ICU 2.0 */ UBLOCK_TAMIL =20, /*[0B80]*/ /** @stable ICU 2.0 */ UBLOCK_TELUGU =21, /*[0C00]*/ /** @stable ICU 2.0 */ UBLOCK_KANNADA =22, /*[0C80]*/ /** @stable ICU 2.0 */ UBLOCK_MALAYALAM =23, /*[0D00]*/ /** @stable ICU 2.0 */ UBLOCK_SINHALA =24, /*[0D80]*/ /** @stable ICU 2.0 */ UBLOCK_THAI =25, /*[0E00]*/ /** @stable ICU 2.0 */ UBLOCK_LAO =26, /*[0E80]*/ /** @stable ICU 2.0 */ UBLOCK_TIBETAN =27, /*[0F00]*/ /** @stable ICU 2.0 */ UBLOCK_MYANMAR =28, /*[1000]*/ /** @stable ICU 2.0 */ UBLOCK_GEORGIAN =29, /*[10A0]*/ /** @stable ICU 2.0 */ UBLOCK_HANGUL_JAMO =30, /*[1100]*/ /** @stable ICU 2.0 */ UBLOCK_ETHIOPIC =31, /*[1200]*/ /** @stable ICU 2.0 */ UBLOCK_CHEROKEE =32, /*[13A0]*/ /** @stable ICU 2.0 */ 
UBLOCK_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS =33, /*[1400]*/ /** @stable ICU 2.0 */ UBLOCK_OGHAM =34, /*[1680]*/ /** @stable ICU 2.0 */ UBLOCK_RUNIC =35, /*[16A0]*/ /** @stable ICU 2.0 */ UBLOCK_KHMER =36, /*[1780]*/ /** @stable ICU 2.0 */ UBLOCK_MONGOLIAN =37, /*[1800]*/ /** @stable ICU 2.0 */ UBLOCK_LATIN_EXTENDED_ADDITIONAL =38, /*[1E00]*/ /** @stable ICU 2.0 */ UBLOCK_GREEK_EXTENDED =39, /*[1F00]*/ /** @stable ICU 2.0 */ UBLOCK_GENERAL_PUNCTUATION =40, /*[2000]*/ /** @stable ICU 2.0 */ UBLOCK_SUPERSCRIPTS_AND_SUBSCRIPTS =41, /*[2070]*/ /** @stable ICU 2.0 */ UBLOCK_CURRENCY_SYMBOLS =42, /*[20A0]*/ /** * Unicode 3.2 renames this block to "Combining Diacritical Marks for Symbols". * @stable ICU 2.0 */ UBLOCK_COMBINING_MARKS_FOR_SYMBOLS =43, /*[20D0]*/ /** @stable ICU 2.0 */ UBLOCK_LETTERLIKE_SYMBOLS =44, /*[2100]*/ /** @stable ICU 2.0 */ UBLOCK_NUMBER_FORMS =45, /*[2150]*/ /** @stable ICU 2.0 */ UBLOCK_ARROWS =46, /*[2190]*/ /** @stable ICU 2.0 */ UBLOCK_MATHEMATICAL_OPERATORS =47, /*[2200]*/ /** @stable ICU 2.0 */ UBLOCK_MISCELLANEOUS_TECHNICAL =48, /*[2300]*/ /** @stable ICU 2.0 */ UBLOCK_CONTROL_PICTURES =49, /*[2400]*/ /** @stable ICU 2.0 */ UBLOCK_OPTICAL_CHARACTER_RECOGNITION =50, /*[2440]*/ /** @stable ICU 2.0 */ UBLOCK_ENCLOSED_ALPHANUMERICS =51, /*[2460]*/ /** @stable ICU 2.0 */ UBLOCK_BOX_DRAWING =52, /*[2500]*/ /** @stable ICU 2.0 */ UBLOCK_BLOCK_ELEMENTS =53, /*[2580]*/ /** @stable ICU 2.0 */ UBLOCK_GEOMETRIC_SHAPES =54, /*[25A0]*/ /** @stable ICU 2.0 */ UBLOCK_MISCELLANEOUS_SYMBOLS =55, /*[2600]*/ /** @stable ICU 2.0 */ UBLOCK_DINGBATS =56, /*[2700]*/ /** @stable ICU 2.0 */ UBLOCK_BRAILLE_PATTERNS =57, /*[2800]*/ /** @stable ICU 2.0 */ UBLOCK_CJK_RADICALS_SUPPLEMENT =58, /*[2E80]*/ /** @stable ICU 2.0 */ UBLOCK_KANGXI_RADICALS =59, /*[2F00]*/ /** @stable ICU 2.0 */ UBLOCK_IDEOGRAPHIC_DESCRIPTION_CHARACTERS =60, /*[2FF0]*/ /** @stable ICU 2.0 */ UBLOCK_CJK_SYMBOLS_AND_PUNCTUATION =61, /*[3000]*/ /** @stable ICU 2.0 */ UBLOCK_HIRAGANA =62, /*[3040]*/ 
/** @stable ICU 2.0 */ UBLOCK_KATAKANA =63, /*[30A0]*/ /** @stable ICU 2.0 */ UBLOCK_BOPOMOFO =64, /*[3100]*/ /** @stable ICU 2.0 */ UBLOCK_HANGUL_COMPATIBILITY_JAMO =65, /*[3130]*/ /** @stable ICU 2.0 */ UBLOCK_KANBUN =66, /*[3190]*/ /** @stable ICU 2.0 */ UBLOCK_BOPOMOFO_EXTENDED =67, /*[31A0]*/ /** @stable ICU 2.0 */ UBLOCK_ENCLOSED_CJK_LETTERS_AND_MONTHS =68, /*[3200]*/ /** @stable ICU 2.0 */ UBLOCK_CJK_COMPATIBILITY =69, /*[3300]*/ /** @stable ICU 2.0 */ UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A =70, /*[3400]*/ /** @stable ICU 2.0 */ UBLOCK_CJK_UNIFIED_IDEOGRAPHS =71, /*[4E00]*/ /** @stable ICU 2.0 */ UBLOCK_YI_SYLLABLES =72, /*[A000]*/ /** @stable ICU 2.0 */ UBLOCK_YI_RADICALS =73, /*[A490]*/ /** @stable ICU 2.0 */ UBLOCK_HANGUL_SYLLABLES =74, /*[AC00]*/ /** @stable ICU 2.0 */ UBLOCK_HIGH_SURROGATES =75, /*[D800]*/ /** @stable ICU 2.0 */ UBLOCK_HIGH_PRIVATE_USE_SURROGATES =76, /*[DB80]*/ /** @stable ICU 2.0 */ UBLOCK_LOW_SURROGATES =77, /*[DC00]*/ /** * Same as UBLOCK_PRIVATE_USE. * Until Unicode 3.1.1, the corresponding block name was "Private Use", * and multiple code point ranges had this block. * Unicode 3.2 renames the block for the BMP PUA to "Private Use Area" and * adds separate blocks for the supplementary PUAs. * * @stable ICU 2.0 */ UBLOCK_PRIVATE_USE_AREA =78, /*[E000]*/ /** * Same as UBLOCK_PRIVATE_USE_AREA. * Until Unicode 3.1.1, the corresponding block name was "Private Use", * and multiple code point ranges had this block. * Unicode 3.2 renames the block for the BMP PUA to "Private Use Area" and * adds separate blocks for the supplementary PUAs. 
* * @stable ICU 2.0 */ UBLOCK_PRIVATE_USE = UBLOCK_PRIVATE_USE_AREA, /** @stable ICU 2.0 */ UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS =79, /*[F900]*/ /** @stable ICU 2.0 */ UBLOCK_ALPHABETIC_PRESENTATION_FORMS =80, /*[FB00]*/ /** @stable ICU 2.0 */ UBLOCK_ARABIC_PRESENTATION_FORMS_A =81, /*[FB50]*/ /** @stable ICU 2.0 */ UBLOCK_COMBINING_HALF_MARKS =82, /*[FE20]*/ /** @stable ICU 2.0 */ UBLOCK_CJK_COMPATIBILITY_FORMS =83, /*[FE30]*/ /** @stable ICU 2.0 */ UBLOCK_SMALL_FORM_VARIANTS =84, /*[FE50]*/ /** @stable ICU 2.0 */ UBLOCK_ARABIC_PRESENTATION_FORMS_B =85, /*[FE70]*/ /** @stable ICU 2.0 */ UBLOCK_SPECIALS =86, /*[FFF0]*/ /** @stable ICU 2.0 */ UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS =87, /*[FF00]*/ /* New blocks in Unicode 3.1 */ /** @stable ICU 2.0 */ UBLOCK_OLD_ITALIC = 88, /*[10300]*/ /** @stable ICU 2.0 */ UBLOCK_GOTHIC = 89, /*[10330]*/ /** @stable ICU 2.0 */ UBLOCK_DESERET = 90, /*[10400]*/ /** @stable ICU 2.0 */ UBLOCK_BYZANTINE_MUSICAL_SYMBOLS = 91, /*[1D000]*/ /** @stable ICU 2.0 */ UBLOCK_MUSICAL_SYMBOLS = 92, /*[1D100]*/ /** @stable ICU 2.0 */ UBLOCK_MATHEMATICAL_ALPHANUMERIC_SYMBOLS = 93, /*[1D400]*/ /** @stable ICU 2.0 */ UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B = 94, /*[20000]*/ /** @stable ICU 2.0 */ UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT = 95, /*[2F800]*/ /** @stable ICU 2.0 */ UBLOCK_TAGS = 96, /*[E0000]*/ /* New blocks in Unicode 3.2 */ /** @stable ICU 3.0 */ UBLOCK_CYRILLIC_SUPPLEMENT = 97, /*[0500]*/ /** * Unicode 4.0.1 renames the "Cyrillic Supplementary" block to "Cyrillic Supplement". 
* @stable ICU 2.2 */ UBLOCK_CYRILLIC_SUPPLEMENTARY = UBLOCK_CYRILLIC_SUPPLEMENT, /** @stable ICU 2.2 */ UBLOCK_TAGALOG = 98, /*[1700]*/ /** @stable ICU 2.2 */ UBLOCK_HANUNOO = 99, /*[1720]*/ /** @stable ICU 2.2 */ UBLOCK_BUHID = 100, /*[1740]*/ /** @stable ICU 2.2 */ UBLOCK_TAGBANWA = 101, /*[1760]*/ /** @stable ICU 2.2 */ UBLOCK_MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A = 102, /*[27C0]*/ /** @stable ICU 2.2 */ UBLOCK_SUPPLEMENTAL_ARROWS_A = 103, /*[27F0]*/ /** @stable ICU 2.2 */ UBLOCK_SUPPLEMENTAL_ARROWS_B = 104, /*[2900]*/ /** @stable ICU 2.2 */ UBLOCK_MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B = 105, /*[2980]*/ /** @stable ICU 2.2 */ UBLOCK_SUPPLEMENTAL_MATHEMATICAL_OPERATORS = 106, /*[2A00]*/ /** @stable ICU 2.2 */ UBLOCK_KATAKANA_PHONETIC_EXTENSIONS = 107, /*[31F0]*/ /** @stable ICU 2.2 */ UBLOCK_VARIATION_SELECTORS = 108, /*[FE00]*/ /** @stable ICU 2.2 */ UBLOCK_SUPPLEMENTARY_PRIVATE_USE_AREA_A = 109, /*[F0000]*/ /** @stable ICU 2.2 */ UBLOCK_SUPPLEMENTARY_PRIVATE_USE_AREA_B = 110, /*[100000]*/ /* New blocks in Unicode 4 */ /** @stable ICU 2.6 */ UBLOCK_LIMBU = 111, /*[1900]*/ /** @stable ICU 2.6 */ UBLOCK_TAI_LE = 112, /*[1950]*/ /** @stable ICU 2.6 */ UBLOCK_KHMER_SYMBOLS = 113, /*[19E0]*/ /** @stable ICU 2.6 */ UBLOCK_PHONETIC_EXTENSIONS = 114, /*[1D00]*/ /** @stable ICU 2.6 */ UBLOCK_MISCELLANEOUS_SYMBOLS_AND_ARROWS = 115, /*[2B00]*/ /** @stable ICU 2.6 */ UBLOCK_YIJING_HEXAGRAM_SYMBOLS = 116, /*[4DC0]*/ /** @stable ICU 2.6 */ UBLOCK_LINEAR_B_SYLLABARY = 117, /*[10000]*/ /** @stable ICU 2.6 */ UBLOCK_LINEAR_B_IDEOGRAMS = 118, /*[10080]*/ /** @stable ICU 2.6 */ UBLOCK_AEGEAN_NUMBERS = 119, /*[10100]*/ /** @stable ICU 2.6 */ UBLOCK_UGARITIC = 120, /*[10380]*/ /** @stable ICU 2.6 */ UBLOCK_SHAVIAN = 121, /*[10450]*/ /** @stable ICU 2.6 */ UBLOCK_OSMANYA = 122, /*[10480]*/ /** @stable ICU 2.6 */ UBLOCK_CYPRIOT_SYLLABARY = 123, /*[10800]*/ /** @stable ICU 2.6 */ UBLOCK_TAI_XUAN_JING_SYMBOLS = 124, /*[1D300]*/ /** @stable ICU 2.6 */ UBLOCK_VARIATION_SELECTORS_SUPPLEMENT 
= 125, /*[E0100]*/ /* New blocks in Unicode 4.1 */ /** @stable ICU 3.4 */ UBLOCK_ANCIENT_GREEK_MUSICAL_NOTATION = 126, /*[1D200]*/ /** @stable ICU 3.4 */ UBLOCK_ANCIENT_GREEK_NUMBERS = 127, /*[10140]*/ /** @stable ICU 3.4 */ UBLOCK_ARABIC_SUPPLEMENT = 128, /*[0750]*/ /** @stable ICU 3.4 */ UBLOCK_BUGINESE = 129, /*[1A00]*/ /** @stable ICU 3.4 */ UBLOCK_CJK_STROKES = 130, /*[31C0]*/ /** @stable ICU 3.4 */ UBLOCK_COMBINING_DIACRITICAL_MARKS_SUPPLEMENT = 131, /*[1DC0]*/ /** @stable ICU 3.4 */ UBLOCK_COPTIC = 132, /*[2C80]*/ /** @stable ICU 3.4 */ UBLOCK_ETHIOPIC_EXTENDED = 133, /*[2D80]*/ /** @stable ICU 3.4 */ UBLOCK_ETHIOPIC_SUPPLEMENT = 134, /*[1380]*/ /** @stable ICU 3.4 */ UBLOCK_GEORGIAN_SUPPLEMENT = 135, /*[2D00]*/ /** @stable ICU 3.4 */ UBLOCK_GLAGOLITIC = 136, /*[2C00]*/ /** @stable ICU 3.4 */ UBLOCK_KHAROSHTHI = 137, /*[10A00]*/ /** @stable ICU 3.4 */ UBLOCK_MODIFIER_TONE_LETTERS = 138, /*[A700]*/ /** @stable ICU 3.4 */ UBLOCK_NEW_TAI_LUE = 139, /*[1980]*/ /** @stable ICU 3.4 */ UBLOCK_OLD_PERSIAN = 140, /*[103A0]*/ /** @stable ICU 3.4 */ UBLOCK_PHONETIC_EXTENSIONS_SUPPLEMENT = 141, /*[1D80]*/ /** @stable ICU 3.4 */ UBLOCK_SUPPLEMENTAL_PUNCTUATION = 142, /*[2E00]*/ /** @stable ICU 3.4 */ UBLOCK_SYLOTI_NAGRI = 143, /*[A800]*/ /** @stable ICU 3.4 */ UBLOCK_TIFINAGH = 144, /*[2D30]*/ /** @stable ICU 3.4 */ UBLOCK_VERTICAL_FORMS = 145, /*[FE10]*/ /* New blocks in Unicode 5.0 */ /** @stable ICU 3.6 */ UBLOCK_NKO = 146, /*[07C0]*/ /** @stable ICU 3.6 */ UBLOCK_BALINESE = 147, /*[1B00]*/ /** @stable ICU 3.6 */ UBLOCK_LATIN_EXTENDED_C = 148, /*[2C60]*/ /** @stable ICU 3.6 */ UBLOCK_LATIN_EXTENDED_D = 149, /*[A720]*/ /** @stable ICU 3.6 */ UBLOCK_PHAGS_PA = 150, /*[A840]*/ /** @stable ICU 3.6 */ UBLOCK_PHOENICIAN = 151, /*[10900]*/ /** @stable ICU 3.6 */ UBLOCK_CUNEIFORM = 152, /*[12000]*/ /** @stable ICU 3.6 */ UBLOCK_CUNEIFORM_NUMBERS_AND_PUNCTUATION = 153, /*[12400]*/ /** @stable ICU 3.6 */ UBLOCK_COUNTING_ROD_NUMERALS = 154, /*[1D360]*/ /* New blocks in Unicode 
5.1 */ /** @stable ICU 4.0 */ UBLOCK_SUNDANESE = 155, /*[1B80]*/ /** @stable ICU 4.0 */ UBLOCK_LEPCHA = 156, /*[1C00]*/ /** @stable ICU 4.0 */ UBLOCK_OL_CHIKI = 157, /*[1C50]*/ /** @stable ICU 4.0 */ UBLOCK_CYRILLIC_EXTENDED_A = 158, /*[2DE0]*/ /** @stable ICU 4.0 */ UBLOCK_VAI = 159, /*[A500]*/ /** @stable ICU 4.0 */ UBLOCK_CYRILLIC_EXTENDED_B = 160, /*[A640]*/ /** @stable ICU 4.0 */ UBLOCK_SAURASHTRA = 161, /*[A880]*/ /** @stable ICU 4.0 */ UBLOCK_KAYAH_LI = 162, /*[A900]*/ /** @stable ICU 4.0 */ UBLOCK_REJANG = 163, /*[A930]*/ /** @stable ICU 4.0 */ UBLOCK_CHAM = 164, /*[AA00]*/ /** @stable ICU 4.0 */ UBLOCK_ANCIENT_SYMBOLS = 165, /*[10190]*/ /** @stable ICU 4.0 */ UBLOCK_PHAISTOS_DISC = 166, /*[101D0]*/ /** @stable ICU 4.0 */ UBLOCK_LYCIAN = 167, /*[10280]*/ /** @stable ICU 4.0 */ UBLOCK_CARIAN = 168, /*[102A0]*/ /** @stable ICU 4.0 */ UBLOCK_LYDIAN = 169, /*[10920]*/ /** @stable ICU 4.0 */ UBLOCK_MAHJONG_TILES = 170, /*[1F000]*/ /** @stable ICU 4.0 */ UBLOCK_DOMINO_TILES = 171, /*[1F030]*/ /* New blocks in Unicode 5.2 */ /** @stable ICU 4.4 */ UBLOCK_SAMARITAN = 172, /*[0800]*/ /** @stable ICU 4.4 */ UBLOCK_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED = 173, /*[18B0]*/ /** @stable ICU 4.4 */ UBLOCK_TAI_THAM = 174, /*[1A20]*/ /** @stable ICU 4.4 */ UBLOCK_VEDIC_EXTENSIONS = 175, /*[1CD0]*/ /** @stable ICU 4.4 */ UBLOCK_LISU = 176, /*[A4D0]*/ /** @stable ICU 4.4 */ UBLOCK_BAMUM = 177, /*[A6A0]*/ /** @stable ICU 4.4 */ UBLOCK_COMMON_INDIC_NUMBER_FORMS = 178, /*[A830]*/ /** @stable ICU 4.4 */ UBLOCK_DEVANAGARI_EXTENDED = 179, /*[A8E0]*/ /** @stable ICU 4.4 */ UBLOCK_HANGUL_JAMO_EXTENDED_A = 180, /*[A960]*/ /** @stable ICU 4.4 */ UBLOCK_JAVANESE = 181, /*[A980]*/ /** @stable ICU 4.4 */ UBLOCK_MYANMAR_EXTENDED_A = 182, /*[AA60]*/ /** @stable ICU 4.4 */ UBLOCK_TAI_VIET = 183, /*[AA80]*/ /** @stable ICU 4.4 */ UBLOCK_MEETEI_MAYEK = 184, /*[ABC0]*/ /** @stable ICU 4.4 */ UBLOCK_HANGUL_JAMO_EXTENDED_B = 185, /*[D7B0]*/ /** @stable ICU 4.4 */ UBLOCK_IMPERIAL_ARAMAIC = 
186, /*[10840]*/ /** @stable ICU 4.4 */ UBLOCK_OLD_SOUTH_ARABIAN = 187, /*[10A60]*/ /** @stable ICU 4.4 */ UBLOCK_AVESTAN = 188, /*[10B00]*/ /** @stable ICU 4.4 */ UBLOCK_INSCRIPTIONAL_PARTHIAN = 189, /*[10B40]*/ /** @stable ICU 4.4 */ UBLOCK_INSCRIPTIONAL_PAHLAVI = 190, /*[10B60]*/ /** @stable ICU 4.4 */ UBLOCK_OLD_TURKIC = 191, /*[10C00]*/ /** @stable ICU 4.4 */ UBLOCK_RUMI_NUMERAL_SYMBOLS = 192, /*[10E60]*/ /** @stable ICU 4.4 */ UBLOCK_KAITHI = 193, /*[11080]*/ /** @stable ICU 4.4 */ UBLOCK_EGYPTIAN_HIEROGLYPHS = 194, /*[13000]*/ /** @stable ICU 4.4 */ UBLOCK_ENCLOSED_ALPHANUMERIC_SUPPLEMENT = 195, /*[1F100]*/ /** @stable ICU 4.4 */ UBLOCK_ENCLOSED_IDEOGRAPHIC_SUPPLEMENT = 196, /*[1F200]*/ /** @stable ICU 4.4 */ UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C = 197, /*[2A700]*/ /* New blocks in Unicode 6.0 */ /** @stable ICU 4.6 */ UBLOCK_MANDAIC = 198, /*[0840]*/ /** @stable ICU 4.6 */ UBLOCK_BATAK = 199, /*[1BC0]*/ /** @stable ICU 4.6 */ UBLOCK_ETHIOPIC_EXTENDED_A = 200, /*[AB00]*/ /** @stable ICU 4.6 */ UBLOCK_BRAHMI = 201, /*[11000]*/ /** @stable ICU 4.6 */ UBLOCK_BAMUM_SUPPLEMENT = 202, /*[16800]*/ /** @stable ICU 4.6 */ UBLOCK_KANA_SUPPLEMENT = 203, /*[1B000]*/ /** @stable ICU 4.6 */ UBLOCK_PLAYING_CARDS = 204, /*[1F0A0]*/ /** @stable ICU 4.6 */ UBLOCK_MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS = 205, /*[1F300]*/ /** @stable ICU 4.6 */ UBLOCK_EMOTICONS = 206, /*[1F600]*/ /** @stable ICU 4.6 */ UBLOCK_TRANSPORT_AND_MAP_SYMBOLS = 207, /*[1F680]*/ /** @stable ICU 4.6 */ UBLOCK_ALCHEMICAL_SYMBOLS = 208, /*[1F700]*/ /** @stable ICU 4.6 */ UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D = 209, /*[2B740]*/ /* New blocks in Unicode 6.1 */ /** @stable ICU 49 */ UBLOCK_ARABIC_EXTENDED_A = 210, /*[08A0]*/ /** @stable ICU 49 */ UBLOCK_ARABIC_MATHEMATICAL_ALPHABETIC_SYMBOLS = 211, /*[1EE00]*/ /** @stable ICU 49 */ UBLOCK_CHAKMA = 212, /*[11100]*/ /** @stable ICU 49 */ UBLOCK_MEETEI_MAYEK_EXTENSIONS = 213, /*[AAE0]*/ /** @stable ICU 49 */ UBLOCK_MEROITIC_CURSIVE = 214, /*[109A0]*/ /** 
@stable ICU 49 */ UBLOCK_MEROITIC_HIEROGLYPHS = 215, /*[10980]*/ /** @stable ICU 49 */ UBLOCK_MIAO = 216, /*[16F00]*/ /** @stable ICU 49 */ UBLOCK_SHARADA = 217, /*[11180]*/ /** @stable ICU 49 */ UBLOCK_SORA_SOMPENG = 218, /*[110D0]*/ /** @stable ICU 49 */ UBLOCK_SUNDANESE_SUPPLEMENT = 219, /*[1CC0]*/ /** @stable ICU 49 */ UBLOCK_TAKRI = 220, /*[11680]*/ /* New blocks in Unicode 7.0 */ /** @stable ICU 54 */ UBLOCK_BASSA_VAH = 221, /*[16AD0]*/ /** @stable ICU 54 */ UBLOCK_CAUCASIAN_ALBANIAN = 222, /*[10530]*/ /** @stable ICU 54 */ UBLOCK_COPTIC_EPACT_NUMBERS = 223, /*[102E0]*/ /** @stable ICU 54 */ UBLOCK_COMBINING_DIACRITICAL_MARKS_EXTENDED = 224, /*[1AB0]*/ /** @stable ICU 54 */ UBLOCK_DUPLOYAN = 225, /*[1BC00]*/ /** @stable ICU 54 */ UBLOCK_ELBASAN = 226, /*[10500]*/ /** @stable ICU 54 */ UBLOCK_GEOMETRIC_SHAPES_EXTENDED = 227, /*[1F780]*/ /** @stable ICU 54 */ UBLOCK_GRANTHA = 228, /*[11300]*/ /** @stable ICU 54 */ UBLOCK_KHOJKI = 229, /*[11200]*/ /** @stable ICU 54 */ UBLOCK_KHUDAWADI = 230, /*[112B0]*/ /** @stable ICU 54 */ UBLOCK_LATIN_EXTENDED_E = 231, /*[AB30]*/ /** @stable ICU 54 */ UBLOCK_LINEAR_A = 232, /*[10600]*/ /** @stable ICU 54 */ UBLOCK_MAHAJANI = 233, /*[11150]*/ /** @stable ICU 54 */ UBLOCK_MANICHAEAN = 234, /*[10AC0]*/ /** @stable ICU 54 */ UBLOCK_MENDE_KIKAKUI = 235, /*[1E800]*/ /** @stable ICU 54 */ UBLOCK_MODI = 236, /*[11600]*/ /** @stable ICU 54 */ UBLOCK_MRO = 237, /*[16A40]*/ /** @stable ICU 54 */ UBLOCK_MYANMAR_EXTENDED_B = 238, /*[A9E0]*/ /** @stable ICU 54 */ UBLOCK_NABATAEAN = 239, /*[10880]*/ /** @stable ICU 54 */ UBLOCK_OLD_NORTH_ARABIAN = 240, /*[10A80]*/ /** @stable ICU 54 */ UBLOCK_OLD_PERMIC = 241, /*[10350]*/ /** @stable ICU 54 */ UBLOCK_ORNAMENTAL_DINGBATS = 242, /*[1F650]*/ /** @stable ICU 54 */ UBLOCK_PAHAWH_HMONG = 243, /*[16B00]*/ /** @stable ICU 54 */ UBLOCK_PALMYRENE = 244, /*[10860]*/ /** @stable ICU 54 */ UBLOCK_PAU_CIN_HAU = 245, /*[11AC0]*/ /** @stable ICU 54 */ UBLOCK_PSALTER_PAHLAVI = 246, /*[10B80]*/ /** @stable 
ICU 54 */ UBLOCK_SHORTHAND_FORMAT_CONTROLS = 247, /*[1BCA0]*/ /** @stable ICU 54 */ UBLOCK_SIDDHAM = 248, /*[11580]*/ /** @stable ICU 54 */ UBLOCK_SINHALA_ARCHAIC_NUMBERS = 249, /*[111E0]*/ /** @stable ICU 54 */ UBLOCK_SUPPLEMENTAL_ARROWS_C = 250, /*[1F800]*/ /** @stable ICU 54 */ UBLOCK_TIRHUTA = 251, /*[11480]*/ /** @stable ICU 54 */ UBLOCK_WARANG_CITI = 252, /*[118A0]*/ /* New blocks in Unicode 8.0 */ /** @stable ICU 56 */ UBLOCK_AHOM = 253, /*[11700]*/ /** @stable ICU 56 */ UBLOCK_ANATOLIAN_HIEROGLYPHS = 254, /*[14400]*/ /** @stable ICU 56 */ UBLOCK_CHEROKEE_SUPPLEMENT = 255, /*[AB70]*/ /** @stable ICU 56 */ UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E = 256, /*[2B820]*/ /** @stable ICU 56 */ UBLOCK_EARLY_DYNASTIC_CUNEIFORM = 257, /*[12480]*/ /** @stable ICU 56 */ UBLOCK_HATRAN = 258, /*[108E0]*/ /** @stable ICU 56 */ UBLOCK_MULTANI = 259, /*[11280]*/ /** @stable ICU 56 */ UBLOCK_OLD_HUNGARIAN = 260, /*[10C80]*/ /** @stable ICU 56 */ UBLOCK_SUPPLEMENTAL_SYMBOLS_AND_PICTOGRAPHS = 261, /*[1F900]*/ /** @stable ICU 56 */ UBLOCK_SUTTON_SIGNWRITING = 262, /*[1D800]*/ /* New blocks in Unicode 9.0 */ /** @stable ICU 58 */ UBLOCK_ADLAM = 263, /*[1E900]*/ /** @stable ICU 58 */ UBLOCK_BHAIKSUKI = 264, /*[11C00]*/ /** @stable ICU 58 */ UBLOCK_CYRILLIC_EXTENDED_C = 265, /*[1C80]*/ /** @stable ICU 58 */ UBLOCK_GLAGOLITIC_SUPPLEMENT = 266, /*[1E000]*/ /** @stable ICU 58 */ UBLOCK_IDEOGRAPHIC_SYMBOLS_AND_PUNCTUATION = 267, /*[16FE0]*/ /** @stable ICU 58 */ UBLOCK_MARCHEN = 268, /*[11C70]*/ /** @stable ICU 58 */ UBLOCK_MONGOLIAN_SUPPLEMENT = 269, /*[11660]*/ /** @stable ICU 58 */ UBLOCK_NEWA = 270, /*[11400]*/ /** @stable ICU 58 */ UBLOCK_OSAGE = 271, /*[104B0]*/ /** @stable ICU 58 */ UBLOCK_TANGUT = 272, /*[17000]*/ /** @stable ICU 58 */ UBLOCK_TANGUT_COMPONENTS = 273, /*[18800]*/ // New blocks in Unicode 10.0 /** @stable ICU 60 */ UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F = 274, /*[2CEB0]*/ /** @stable ICU 60 */ UBLOCK_KANA_EXTENDED_A = 275, /*[1B100]*/ /** @stable ICU 60 */ 
UBLOCK_MASARAM_GONDI = 276, /*[11D00]*/ /** @stable ICU 60 */ UBLOCK_NUSHU = 277, /*[1B170]*/ /** @stable ICU 60 */ UBLOCK_SOYOMBO = 278, /*[11A50]*/ /** @stable ICU 60 */ UBLOCK_SYRIAC_SUPPLEMENT = 279, /*[0860]*/ /** @stable ICU 60 */ UBLOCK_ZANABAZAR_SQUARE = 280, /*[11A00]*/ #if (NTDDI_VERSION >= NTDDI_WIN10_19H1) // New blocks in Unicode 11.0 /** @stable ICU 62 */ UBLOCK_CHESS_SYMBOLS = 281, /*[1FA00]*/ /** @stable ICU 62 */ UBLOCK_DOGRA = 282, /*[11800]*/ /** @stable ICU 62 */ UBLOCK_GEORGIAN_EXTENDED = 283, /*[1C90]*/ /** @stable ICU 62 */ UBLOCK_GUNJALA_GONDI = 284, /*[11D60]*/ /** @stable ICU 62 */ UBLOCK_HANIFI_ROHINGYA = 285, /*[10D00]*/ /** @stable ICU 62 */ UBLOCK_INDIC_SIYAQ_NUMBERS = 286, /*[1EC70]*/ /** @stable ICU 62 */ UBLOCK_MAKASAR = 287, /*[11EE0]*/ /** @stable ICU 62 */ UBLOCK_MAYAN_NUMERALS = 288, /*[1D2E0]*/ /** @stable ICU 62 */ UBLOCK_MEDEFAIDRIN = 289, /*[16E40]*/ /** @stable ICU 62 */ UBLOCK_OLD_SOGDIAN = 290, /*[10F00]*/ /** @stable ICU 62 */ UBLOCK_SOGDIAN = 291, /*[10F30]*/ #endif // (NTDDI_VERSION >= NTDDI_WIN10_19H1) #if (NTDDI_VERSION >= NTDDI_WIN10_VB) // New blocks in Unicode 12.0 /** @stable ICU 64 */ UBLOCK_EGYPTIAN_HIEROGLYPH_FORMAT_CONTROLS = 292, /*[13430]*/ /** @stable ICU 64 */ UBLOCK_ELYMAIC = 293, /*[10FE0]*/ /** @stable ICU 64 */ UBLOCK_NANDINAGARI = 294, /*[119A0]*/ /** @stable ICU 64 */ UBLOCK_NYIAKENG_PUACHUE_HMONG = 295, /*[1E100]*/ /** @stable ICU 64 */ UBLOCK_OTTOMAN_SIYAQ_NUMBERS = 296, /*[1ED00]*/ /** @stable ICU 64 */ UBLOCK_SMALL_KANA_EXTENSION = 297, /*[1B130]*/ /** @stable ICU 64 */ UBLOCK_SYMBOLS_AND_PICTOGRAPHS_EXTENDED_A = 298, /*[1FA70]*/ /** @stable ICU 64 */ UBLOCK_TAMIL_SUPPLEMENT = 299, /*[11FC0]*/ /** @stable ICU 64 */ UBLOCK_WANCHO = 300, /*[1E2C0]*/ #endif // (NTDDI_VERSION >= NTDDI_WIN10_VB) #if (NTDDI_VERSION >= NTDDI_WIN10_CO) // New blocks in Unicode 13.0 /** @stable ICU 66 */ UBLOCK_CHORASMIAN = 301, /*[10FB0]*/ /** @stable ICU 66 */ UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_G = 302, 
/*[30000]*/ /** @stable ICU 66 */ UBLOCK_DIVES_AKURU = 303, /*[11900]*/ /** @stable ICU 66 */ UBLOCK_KHITAN_SMALL_SCRIPT = 304, /*[18B00]*/ /** @stable ICU 66 */ UBLOCK_LISU_SUPPLEMENT = 305, /*[11FB0]*/ /** @stable ICU 66 */ UBLOCK_SYMBOLS_FOR_LEGACY_COMPUTING = 306, /*[1FB00]*/ /** @stable ICU 66 */ UBLOCK_TANGUT_SUPPLEMENT = 307, /*[18D00]*/ /** @stable ICU 66 */ UBLOCK_YEZIDI = 308, /*[10E80]*/ #endif // (NTDDI_VERSION >= NTDDI_WIN10_CO) /** @stable ICU 2.0 */ UBLOCK_INVALID_CODE=-1 }; /** @stable ICU 2.0 */ typedef enum UBlockCode UBlockCode; /** * East Asian Width constants. * * @see UCHAR_EAST_ASIAN_WIDTH * @see u_getIntPropertyValue * @stable ICU 2.2 */ typedef enum UEastAsianWidth { /* * Note: UEastAsianWidth constants are parsed by preparseucd.py. * It matches lines like * U_EA_ */ U_EA_NEUTRAL, /*[N]*/ U_EA_AMBIGUOUS, /*[A]*/ U_EA_HALFWIDTH, /*[H]*/ U_EA_FULLWIDTH, /*[F]*/ U_EA_NARROW, /*[Na]*/ U_EA_WIDE, /*[W]*/ } UEastAsianWidth; /** * Selector constants for u_charName(). * u_charName() returns the "modern" name of a * Unicode character; or the name that was defined in * Unicode version 1.0, before the Unicode standard merged * with ISO-10646; or an "extended" name that gives each * Unicode code point a unique name. * * @see u_charName * @stable ICU 2.0 */ typedef enum UCharNameChoice { /** Unicode character name (Name property). @stable ICU 2.0 */ U_UNICODE_CHAR_NAME, /** Standard or synthetic character name. @stable ICU 2.0 */ U_EXTENDED_CHAR_NAME = U_UNICODE_CHAR_NAME+2, /** Corrected name from NameAliases.txt. @stable ICU 4.4 */ U_CHAR_NAME_ALIAS, } UCharNameChoice; /** * Selector constants for u_getPropertyName() and * u_getPropertyValueName(). These selectors are used to choose which * name is returned for a given property or value. All properties and * values have a long name. Most have a short name, but some do not. 
* Unicode allows for additional names, beyond the long and short * name, which would be indicated by U_LONG_PROPERTY_NAME + i, where * i=1, 2,... * * @see u_getPropertyName() * @see u_getPropertyValueName() * @stable ICU 2.4 */ typedef enum UPropertyNameChoice { U_SHORT_PROPERTY_NAME, U_LONG_PROPERTY_NAME, } UPropertyNameChoice; /** * Decomposition Type constants. * * @see UCHAR_DECOMPOSITION_TYPE * @stable ICU 2.2 */ typedef enum UDecompositionType { /* * Note: UDecompositionType constants are parsed by preparseucd.py. * It matches lines like * U_DT_ */ U_DT_NONE, /*[none]*/ U_DT_CANONICAL, /*[can]*/ U_DT_COMPAT, /*[com]*/ U_DT_CIRCLE, /*[enc]*/ U_DT_FINAL, /*[fin]*/ U_DT_FONT, /*[font]*/ U_DT_FRACTION, /*[fra]*/ U_DT_INITIAL, /*[init]*/ U_DT_ISOLATED, /*[iso]*/ U_DT_MEDIAL, /*[med]*/ U_DT_NARROW, /*[nar]*/ U_DT_NOBREAK, /*[nb]*/ U_DT_SMALL, /*[sml]*/ U_DT_SQUARE, /*[sqr]*/ U_DT_SUB, /*[sub]*/ U_DT_SUPER, /*[sup]*/ U_DT_VERTICAL, /*[vert]*/ U_DT_WIDE, /*[wide]*/ } UDecompositionType; /** * Joining Type constants. * * @see UCHAR_JOINING_TYPE * @stable ICU 2.2 */ typedef enum UJoiningType { /* * Note: UJoiningType constants are parsed by preparseucd.py. * It matches lines like * U_JT_ */ U_JT_NON_JOINING, /*[U]*/ U_JT_JOIN_CAUSING, /*[C]*/ U_JT_DUAL_JOINING, /*[D]*/ U_JT_LEFT_JOINING, /*[L]*/ U_JT_RIGHT_JOINING, /*[R]*/ U_JT_TRANSPARENT, /*[T]*/ } UJoiningType; /** * Joining Group constants. * * @see UCHAR_JOINING_GROUP * @stable ICU 2.2 */ typedef enum UJoiningGroup { /* * Note: UJoiningGroup constants are parsed by preparseucd.py. 
* It matches lines like * U_JG_ */ U_JG_NO_JOINING_GROUP, U_JG_AIN, U_JG_ALAPH, U_JG_ALEF, U_JG_BEH, U_JG_BETH, U_JG_DAL, U_JG_DALATH_RISH, U_JG_E, U_JG_FEH, U_JG_FINAL_SEMKATH, U_JG_GAF, U_JG_GAMAL, U_JG_HAH, U_JG_TEH_MARBUTA_GOAL, /**< @stable ICU 4.6 */ U_JG_HAMZA_ON_HEH_GOAL=U_JG_TEH_MARBUTA_GOAL, U_JG_HE, U_JG_HEH, U_JG_HEH_GOAL, U_JG_HETH, U_JG_KAF, U_JG_KAPH, U_JG_KNOTTED_HEH, U_JG_LAM, U_JG_LAMADH, U_JG_MEEM, U_JG_MIM, U_JG_NOON, U_JG_NUN, U_JG_PE, U_JG_QAF, U_JG_QAPH, U_JG_REH, U_JG_REVERSED_PE, U_JG_SAD, U_JG_SADHE, U_JG_SEEN, U_JG_SEMKATH, U_JG_SHIN, U_JG_SWASH_KAF, U_JG_SYRIAC_WAW, U_JG_TAH, U_JG_TAW, U_JG_TEH_MARBUTA, U_JG_TETH, U_JG_WAW, U_JG_YEH, U_JG_YEH_BARREE, U_JG_YEH_WITH_TAIL, U_JG_YUDH, U_JG_YUDH_HE, U_JG_ZAIN, U_JG_FE, /**< @stable ICU 2.6 */ U_JG_KHAPH, /**< @stable ICU 2.6 */ U_JG_ZHAIN, /**< @stable ICU 2.6 */ U_JG_BURUSHASKI_YEH_BARREE, /**< @stable ICU 4.0 */ U_JG_FARSI_YEH, /**< @stable ICU 4.4 */ U_JG_NYA, /**< @stable ICU 4.4 */ U_JG_ROHINGYA_YEH, /**< @stable ICU 49 */ U_JG_MANICHAEAN_ALEPH, /**< @stable ICU 54 */ U_JG_MANICHAEAN_AYIN, /**< @stable ICU 54 */ U_JG_MANICHAEAN_BETH, /**< @stable ICU 54 */ U_JG_MANICHAEAN_DALETH, /**< @stable ICU 54 */ U_JG_MANICHAEAN_DHAMEDH, /**< @stable ICU 54 */ U_JG_MANICHAEAN_FIVE, /**< @stable ICU 54 */ U_JG_MANICHAEAN_GIMEL, /**< @stable ICU 54 */ U_JG_MANICHAEAN_HETH, /**< @stable ICU 54 */ U_JG_MANICHAEAN_HUNDRED, /**< @stable ICU 54 */ U_JG_MANICHAEAN_KAPH, /**< @stable ICU 54 */ U_JG_MANICHAEAN_LAMEDH, /**< @stable ICU 54 */ U_JG_MANICHAEAN_MEM, /**< @stable ICU 54 */ U_JG_MANICHAEAN_NUN, /**< @stable ICU 54 */ U_JG_MANICHAEAN_ONE, /**< @stable ICU 54 */ U_JG_MANICHAEAN_PE, /**< @stable ICU 54 */ U_JG_MANICHAEAN_QOPH, /**< @stable ICU 54 */ U_JG_MANICHAEAN_RESH, /**< @stable ICU 54 */ U_JG_MANICHAEAN_SADHE, /**< @stable ICU 54 */ U_JG_MANICHAEAN_SAMEKH, /**< @stable ICU 54 */ U_JG_MANICHAEAN_TAW, /**< @stable ICU 54 */ U_JG_MANICHAEAN_TEN, /**< @stable ICU 54 */ U_JG_MANICHAEAN_TETH, /**< 
@stable ICU 54 */ U_JG_MANICHAEAN_THAMEDH, /**< @stable ICU 54 */ U_JG_MANICHAEAN_TWENTY, /**< @stable ICU 54 */ U_JG_MANICHAEAN_WAW, /**< @stable ICU 54 */ U_JG_MANICHAEAN_YODH, /**< @stable ICU 54 */ U_JG_MANICHAEAN_ZAYIN, /**< @stable ICU 54 */ U_JG_STRAIGHT_WAW, /**< @stable ICU 54 */ U_JG_AFRICAN_FEH, /**< @stable ICU 58 */ U_JG_AFRICAN_NOON, /**< @stable ICU 58 */ U_JG_AFRICAN_QAF, /**< @stable ICU 58 */ U_JG_MALAYALAM_BHA, /**< @stable ICU 60 */ U_JG_MALAYALAM_JA, /**< @stable ICU 60 */ U_JG_MALAYALAM_LLA, /**< @stable ICU 60 */ U_JG_MALAYALAM_LLLA, /**< @stable ICU 60 */ U_JG_MALAYALAM_NGA, /**< @stable ICU 60 */ U_JG_MALAYALAM_NNA, /**< @stable ICU 60 */ U_JG_MALAYALAM_NNNA, /**< @stable ICU 60 */ U_JG_MALAYALAM_NYA, /**< @stable ICU 60 */ U_JG_MALAYALAM_RA, /**< @stable ICU 60 */ U_JG_MALAYALAM_SSA, /**< @stable ICU 60 */ U_JG_MALAYALAM_TTA, /**< @stable ICU 60 */ #if (NTDDI_VERSION >= NTDDI_WIN10_19H1) U_JG_HANIFI_ROHINGYA_KINNA_YA, /**< @stable ICU 62 */ U_JG_HANIFI_ROHINGYA_PA, /**< @stable ICU 62 */ #endif // (NTDDI_VERSION >= NTDDI_WIN10_19H1) #if (NTDDI_VERSION >= NTDDI_WIN11_ZN) U_JG_THIN_YEH, /**< @stable ICU 70 */ U_JG_VERTICAL_TAIL, /**< @stable ICU 70 */ #endif // (NTDDI_VERSION >= NTDDI_WIN11_ZN) } UJoiningGroup; /** * Grapheme Cluster Break constants. * * @see UCHAR_GRAPHEME_CLUSTER_BREAK * @stable ICU 3.4 */ typedef enum UGraphemeClusterBreak { /* * Note: UGraphemeClusterBreak constants are parsed by preparseucd.py. 
* It matches lines like
     *     U_GCB_
     */
    U_GCB_OTHER = 0,            /*[XX]*/
    U_GCB_CONTROL = 1,          /*[CN]*/
    U_GCB_CR = 2,               /*[CR]*/
    U_GCB_EXTEND = 3,           /*[EX]*/
    U_GCB_L = 4,                /*[L]*/
    U_GCB_LF = 5,               /*[LF]*/
    U_GCB_LV = 6,               /*[LV]*/
    U_GCB_LVT = 7,              /*[LVT]*/
    U_GCB_T = 8,                /*[T]*/
    U_GCB_V = 9,                /*[V]*/
    /** @stable ICU 4.0 */
    U_GCB_SPACING_MARK = 10,    /*[SM]*/ /* from here on: new in Unicode 5.1/ICU 4.0 */
    /** @stable ICU 4.0 */
    U_GCB_PREPEND = 11,         /*[PP]*/
    /** @stable ICU 50 */
    U_GCB_REGIONAL_INDICATOR = 12,  /*[RI]*/ /* new in Unicode 6.2/ICU 50 */
    /** @stable ICU 58 */
    U_GCB_E_BASE = 13,          /*[EB]*/ /* from here on: new in Unicode 9.0/ICU 58 */
    /** @stable ICU 58 */
    U_GCB_E_BASE_GAZ = 14,      /*[EBG]*/
    /** @stable ICU 58 */
    U_GCB_E_MODIFIER = 15,      /*[EM]*/
    /** @stable ICU 58 */
    U_GCB_GLUE_AFTER_ZWJ = 16,  /*[GAZ]*/
    /** @stable ICU 58 */
    U_GCB_ZWJ = 17,             /*[ZWJ]*/
} UGraphemeClusterBreak;

/**
 * Word Break constants.
 * (UWordBreak is a pre-existing enum type in ubrk.h for word break status tags.)
 *
 * @see UCHAR_WORD_BREAK
 * @stable ICU 3.4
 */
typedef enum UWordBreakValues {
    /*
     * Note: UWordBreakValues constants are parsed by preparseucd.py.
* It matches lines like * U_WB_ */ U_WB_OTHER = 0, /*[XX]*/ U_WB_ALETTER = 1, /*[LE]*/ U_WB_FORMAT = 2, /*[FO]*/ U_WB_KATAKANA = 3, /*[KA]*/ U_WB_MIDLETTER = 4, /*[ML]*/ U_WB_MIDNUM = 5, /*[MN]*/ U_WB_NUMERIC = 6, /*[NU]*/ U_WB_EXTENDNUMLET = 7, /*[EX]*/ /** @stable ICU 4.0 */ U_WB_CR = 8, /*[CR]*/ /* from here on: new in Unicode 5.1/ICU 4.0 */ /** @stable ICU 4.0 */ U_WB_EXTEND = 9, /*[Extend]*/ /** @stable ICU 4.0 */ U_WB_LF = 10, /*[LF]*/ /** @stable ICU 4.0 */ U_WB_MIDNUMLET =11, /*[MB]*/ /** @stable ICU 4.0 */ U_WB_NEWLINE =12, /*[NL]*/ /** @stable ICU 50 */ U_WB_REGIONAL_INDICATOR = 13, /*[RI]*/ /* new in Unicode 6.2/ICU 50 */ /** @stable ICU 52 */ U_WB_HEBREW_LETTER = 14, /*[HL]*/ /* from here on: new in Unicode 6.3/ICU 52 */ /** @stable ICU 52 */ U_WB_SINGLE_QUOTE = 15, /*[SQ]*/ /** @stable ICU 52 */ U_WB_DOUBLE_QUOTE = 16, /*[DQ]*/ /** @stable ICU 58 */ U_WB_E_BASE = 17, /*[EB]*/ /* from here on: new in Unicode 9.0/ICU 58 */ /** @stable ICU 58 */ U_WB_E_BASE_GAZ = 18, /*[EBG]*/ /** @stable ICU 58 */ U_WB_E_MODIFIER = 19, /*[EM]*/ /** @stable ICU 58 */ U_WB_GLUE_AFTER_ZWJ = 20, /*[GAZ]*/ /** @stable ICU 58 */ U_WB_ZWJ = 21, /*[ZWJ]*/ #if (NTDDI_VERSION >= NTDDI_WIN10_19H1) /** @stable ICU 62 */ U_WB_WSEGSPACE = 22, /*[WSEGSPACE]*/ #endif // (NTDDI_VERSION >= NTDDI_WIN10_19H1) } UWordBreakValues; /** * Sentence Break constants. * * @see UCHAR_SENTENCE_BREAK * @stable ICU 3.4 */ typedef enum USentenceBreak { /* * Note: USentenceBreak constants are parsed by preparseucd.py. 
* It matches lines like
     *     U_SB_
     */
    U_SB_OTHER = 0,             /*[XX]*/
    U_SB_ATERM = 1,             /*[AT]*/
    U_SB_CLOSE = 2,             /*[CL]*/
    U_SB_FORMAT = 3,            /*[FO]*/
    U_SB_LOWER = 4,             /*[LO]*/
    U_SB_NUMERIC = 5,           /*[NU]*/
    U_SB_OLETTER = 6,           /*[LE]*/
    U_SB_SEP = 7,               /*[SE]*/
    U_SB_SP = 8,                /*[SP]*/
    U_SB_STERM = 9,             /*[ST]*/
    U_SB_UPPER = 10,            /*[UP]*/
    U_SB_CR = 11,               /*[CR]*/ /* from here on: new in Unicode 5.1/ICU 4.0 */
    U_SB_EXTEND = 12,           /*[EX]*/
    U_SB_LF = 13,               /*[LF]*/
    U_SB_SCONTINUE = 14,        /*[SC]*/
} USentenceBreak;

/**
 * Line Break constants.
 *
 * @see UCHAR_LINE_BREAK
 * @stable ICU 2.2
 */
typedef enum ULineBreak {
    /*
     * Note: ULineBreak constants are parsed by preparseucd.py.
     * It matches lines like
     *     U_LB_
     */
    U_LB_UNKNOWN = 0,           /*[XX]*/
    U_LB_AMBIGUOUS = 1,         /*[AI]*/
    U_LB_ALPHABETIC = 2,        /*[AL]*/
    U_LB_BREAK_BOTH = 3,        /*[B2]*/
    U_LB_BREAK_AFTER = 4,       /*[BA]*/
    U_LB_BREAK_BEFORE = 5,      /*[BB]*/
    U_LB_MANDATORY_BREAK = 6,   /*[BK]*/
    U_LB_CONTINGENT_BREAK = 7,  /*[CB]*/
    U_LB_CLOSE_PUNCTUATION = 8, /*[CL]*/
    U_LB_COMBINING_MARK = 9,    /*[CM]*/
    U_LB_CARRIAGE_RETURN = 10,  /*[CR]*/
    U_LB_EXCLAMATION = 11,      /*[EX]*/
    U_LB_GLUE = 12,             /*[GL]*/
    U_LB_HYPHEN = 13,           /*[HY]*/
    U_LB_IDEOGRAPHIC = 14,      /*[ID]*/
    /** Renamed from the misspelled "inseperable" in Unicode 4.0.1/ICU 3.0 @stable ICU 3.0 */
    U_LB_INSEPARABLE = 15,      /*[IN]*/
    /* Misspelled alias with the same value (15) as U_LB_INSEPARABLE. */
    U_LB_INSEPERABLE = U_LB_INSEPARABLE,
    U_LB_INFIX_NUMERIC = 16,    /*[IS]*/
    U_LB_LINE_FEED = 17,        /*[LF]*/
    U_LB_NONSTARTER = 18,       /*[NS]*/
    U_LB_NUMERIC = 19,          /*[NU]*/
    U_LB_OPEN_PUNCTUATION = 20, /*[OP]*/
    U_LB_POSTFIX_NUMERIC = 21,  /*[PO]*/
    U_LB_PREFIX_NUMERIC = 22,   /*[PR]*/
    U_LB_QUOTATION = 23,        /*[QU]*/
    U_LB_COMPLEX_CONTEXT = 24,  /*[SA]*/
    U_LB_SURROGATE = 25,        /*[SG]*/
    U_LB_SPACE = 26,            /*[SP]*/
    U_LB_BREAK_SYMBOLS = 27,    /*[SY]*/
    U_LB_ZWSPACE = 28,          /*[ZW]*/
    /** @stable ICU 2.6 */
    U_LB_NEXT_LINE = 29,        /*[NL]*/ /* from here on: new in Unicode 4/ICU 2.6 */
    /** @stable ICU 2.6 */
    U_LB_WORD_JOINER = 30,      /*[WJ]*/
    /** @stable ICU 3.4 */
    U_LB_H2 = 31,               /*[H2]*/ /* from here on: new in Unicode 4.1/ICU 3.4 */
    /** @stable ICU 3.4 */
U_LB_H3 = 32, /*[H3]*/ /** @stable ICU 3.4 */ U_LB_JL = 33, /*[JL]*/ /** @stable ICU 3.4 */ U_LB_JT = 34, /*[JT]*/ /** @stable ICU 3.4 */ U_LB_JV = 35, /*[JV]*/ /** @stable ICU 4.4 */ U_LB_CLOSE_PARENTHESIS = 36, /*[CP]*/ /* new in Unicode 5.2/ICU 4.4 */ /** @stable ICU 49 */ U_LB_CONDITIONAL_JAPANESE_STARTER = 37,/*[CJ]*/ /* new in Unicode 6.1/ICU 49 */ /** @stable ICU 49 */ U_LB_HEBREW_LETTER = 38, /*[HL]*/ /* new in Unicode 6.1/ICU 49 */ /** @stable ICU 50 */ U_LB_REGIONAL_INDICATOR = 39,/*[RI]*/ /* new in Unicode 6.2/ICU 50 */ /** @stable ICU 58 */ U_LB_E_BASE = 40, /*[EB]*/ /* from here on: new in Unicode 9.0/ICU 58 */ /** @stable ICU 58 */ U_LB_E_MODIFIER = 41, /*[EM]*/ /** @stable ICU 58 */ U_LB_ZWJ = 42, /*[ZWJ]*/ } ULineBreak; /** * Numeric Type constants. * * @see UCHAR_NUMERIC_TYPE * @stable ICU 2.2 */ typedef enum UNumericType { /* * Note: UNumericType constants are parsed by preparseucd.py. * It matches lines like * U_NT_ */ U_NT_NONE, /*[None]*/ U_NT_DECIMAL, /*[de]*/ U_NT_DIGIT, /*[di]*/ U_NT_NUMERIC, /*[nu]*/ } UNumericType; /** * Hangul Syllable Type constants. * * @see UCHAR_HANGUL_SYLLABLE_TYPE * @stable ICU 2.6 */ typedef enum UHangulSyllableType { /* * Note: UHangulSyllableType constants are parsed by preparseucd.py. * It matches lines like * U_HST_ */ U_HST_NOT_APPLICABLE, /*[NA]*/ U_HST_LEADING_JAMO, /*[L]*/ U_HST_VOWEL_JAMO, /*[V]*/ U_HST_TRAILING_JAMO, /*[T]*/ U_HST_LV_SYLLABLE, /*[LV]*/ U_HST_LVT_SYLLABLE, /*[LVT]*/ } UHangulSyllableType; #if (NTDDI_VERSION >= NTDDI_WIN10_19H1) /** * Indic Positional Category constants. * * @see UCHAR_INDIC_POSITIONAL_CATEGORY * @stable ICU 63 */ typedef enum UIndicPositionalCategory { /* * Note: UIndicPositionalCategory constants are parsed by preparseucd.py. 
* It matches lines like * U_INPC_ */ /** @stable ICU 63 */ U_INPC_NA, /** @stable ICU 63 */ U_INPC_BOTTOM, /** @stable ICU 63 */ U_INPC_BOTTOM_AND_LEFT, /** @stable ICU 63 */ U_INPC_BOTTOM_AND_RIGHT, /** @stable ICU 63 */ U_INPC_LEFT, /** @stable ICU 63 */ U_INPC_LEFT_AND_RIGHT, /** @stable ICU 63 */ U_INPC_OVERSTRUCK, /** @stable ICU 63 */ U_INPC_RIGHT, /** @stable ICU 63 */ U_INPC_TOP, /** @stable ICU 63 */ U_INPC_TOP_AND_BOTTOM, /** @stable ICU 63 */ U_INPC_TOP_AND_BOTTOM_AND_RIGHT, /** @stable ICU 63 */ U_INPC_TOP_AND_LEFT, /** @stable ICU 63 */ U_INPC_TOP_AND_LEFT_AND_RIGHT, /** @stable ICU 63 */ U_INPC_TOP_AND_RIGHT, /** @stable ICU 63 */ U_INPC_VISUAL_ORDER_LEFT, #if (NTDDI_VERSION >= NTDDI_WIN10_CO) /** @stable ICU 66 */ U_INPC_TOP_AND_BOTTOM_AND_LEFT, #endif // (NTDDI_VERSION >= NTDDI_WIN10_CO) } UIndicPositionalCategory; /** * Indic Syllabic Category constants. * * @see UCHAR_INDIC_SYLLABIC_CATEGORY * @stable ICU 63 */ typedef enum UIndicSyllabicCategory { /* * Note: UIndicSyllabicCategory constants are parsed by preparseucd.py. 
* It matches lines like
     *     U_INSC_
     */
    /* Values are implicit and consecutive from 0; do not reorder. */
    /** @stable ICU 63 */
    U_INSC_OTHER,
    /** @stable ICU 63 */
    U_INSC_AVAGRAHA,
    /** @stable ICU 63 */
    U_INSC_BINDU,
    /** @stable ICU 63 */
    U_INSC_BRAHMI_JOINING_NUMBER,
    /** @stable ICU 63 */
    U_INSC_CANTILLATION_MARK,
    /** @stable ICU 63 */
    U_INSC_CONSONANT,
    /** @stable ICU 63 */
    U_INSC_CONSONANT_DEAD,
    /** @stable ICU 63 */
    U_INSC_CONSONANT_FINAL,
    /** @stable ICU 63 */
    U_INSC_CONSONANT_HEAD_LETTER,
    /** @stable ICU 63 */
    U_INSC_CONSONANT_INITIAL_POSTFIXED,
    /** @stable ICU 63 */
    U_INSC_CONSONANT_KILLER,
    /** @stable ICU 63 */
    U_INSC_CONSONANT_MEDIAL,
    /** @stable ICU 63 */
    U_INSC_CONSONANT_PLACEHOLDER,
    /** @stable ICU 63 */
    U_INSC_CONSONANT_PRECEDING_REPHA,
    /** @stable ICU 63 */
    U_INSC_CONSONANT_PREFIXED,
    /** @stable ICU 63 */
    U_INSC_CONSONANT_SUBJOINED,
    /** @stable ICU 63 */
    U_INSC_CONSONANT_SUCCEEDING_REPHA,
    /** @stable ICU 63 */
    U_INSC_CONSONANT_WITH_STACKER,
    /** @stable ICU 63 */
    U_INSC_GEMINATION_MARK,
    /** @stable ICU 63 */
    U_INSC_INVISIBLE_STACKER,
    /** @stable ICU 63 */
    U_INSC_JOINER,
    /** @stable ICU 63 */
    U_INSC_MODIFYING_LETTER,
    /** @stable ICU 63 */
    U_INSC_NON_JOINER,
    /** @stable ICU 63 */
    U_INSC_NUKTA,
    /** @stable ICU 63 */
    U_INSC_NUMBER,
    /** @stable ICU 63 */
    U_INSC_NUMBER_JOINER,
    /** @stable ICU 63 */
    U_INSC_PURE_KILLER,
    /** @stable ICU 63 */
    U_INSC_REGISTER_SHIFTER,
    /** @stable ICU 63 */
    U_INSC_SYLLABLE_MODIFIER,
    /** @stable ICU 63 */
    U_INSC_TONE_LETTER,
    /** @stable ICU 63 */
    U_INSC_TONE_MARK,
    /** @stable ICU 63 */
    U_INSC_VIRAMA,
    /** @stable ICU 63 */
    U_INSC_VISARGA,
    /** @stable ICU 63 */
    U_INSC_VOWEL,
    /** @stable ICU 63 */
    U_INSC_VOWEL_DEPENDENT,
    /** @stable ICU 63 */
    U_INSC_VOWEL_INDEPENDENT,
} UIndicSyllabicCategory;

/**
 * Vertical Orientation constants.
 *
 * @see UCHAR_VERTICAL_ORIENTATION
 * @stable ICU 63
 */
typedef enum UVerticalOrientation {
    /*
     * Note: UVerticalOrientation constants are parsed by preparseucd.py.
* It matches lines like * U_VO_ */ /** @stable ICU 63 */ U_VO_ROTATED, /** @stable ICU 63 */ U_VO_TRANSFORMED_ROTATED, /** @stable ICU 63 */ U_VO_TRANSFORMED_UPRIGHT, /** @stable ICU 63 */ U_VO_UPRIGHT, } UVerticalOrientation; #endif // (NTDDI_VERSION >= NTDDI_WIN10_19H1) /** * Check a binary Unicode property for a code point. * * Unicode, especially in version 3.2, defines many more properties than the * original set in UnicodeData.txt. * * The properties APIs are intended to reflect Unicode properties as defined * in the Unicode Character Database (UCD) and Unicode Technical Reports (UTR). * For details about the properties see http://www.unicode.org/ucd/ . * For names of Unicode properties see the UCD file PropertyAliases.txt. * * Important: If ICU is built with UCD files from Unicode versions below 3.2, * then properties marked with "new in Unicode 3.2" are not or not fully available. * * @param c Code point to test. * @param which UProperty selector constant, identifies which binary property to check. * Must be UCHAR_BINARY_START<=which= NTDDI_WIN11_ZN) /** * Returns true if the property is true for the string. * Same as u_hasBinaryProperty(single code point, which) * if the string contains exactly one code point. * * Most properties apply only to single code points. * UTS #51 Unicode Emoji * defines several properties of strings. * * @param s String to test. * @param length Length of the string, or negative if NUL-terminated. * @param which UProperty selector constant, identifies which binary property to check. * Must be UCHAR_BINARY_START<=which<UCHAR_BINARY_LIMIT. * @return true or false according to the binary Unicode property value for the string. * Also false if 'which' is out of bounds or if the Unicode version * does not have data for the property at all. 
* * @see UProperty * @see u_hasBinaryProperty * @see u_getBinaryPropertySet * @see u_getIntPropertyValue * @see u_getUnicodeVersion * @stable ICU 70 */ U_CAPI UBool U_EXPORT2 u_stringHasBinaryProperty(const UChar *s, int32_t length, UProperty which); #endif // (NTDDI_VERSION >= NTDDI_WIN11_ZN) #if (NTDDI_VERSION >= NTDDI_WIN10_CO) /** * Returns a frozen USet for a binary property. * The library retains ownership over the returned object. * Sets an error code if the property number is not one for a binary property. * * The returned set contains all code points for which the property is true. * * @param property UCHAR_BINARY_START..UCHAR_BINARY_LIMIT-1 * @param pErrorCode an in/out ICU UErrorCode * @return the property as a set * @see UProperty * @see u_hasBinaryProperty * @see Unicode::fromUSet * @stable ICU 63 */ U_CAPI const USet * U_EXPORT2 u_getBinaryPropertySet(UProperty property, UErrorCode *pErrorCode); #endif // (NTDDI_VERSION >= NTDDI_WIN10_CO) /** * Check if a code point has the Alphabetic Unicode property. * Same as u_hasBinaryProperty(c, UCHAR_ALPHABETIC). * This is different from u_isalpha! * @param c Code point to test * @return true if the code point has the Alphabetic Unicode property, false otherwise * * @see UCHAR_ALPHABETIC * @see u_isalpha * @see u_hasBinaryProperty * @stable ICU 2.1 */ U_CAPI UBool U_EXPORT2 u_isUAlphabetic(UChar32 c); /** * Check if a code point has the Lowercase Unicode property. * Same as u_hasBinaryProperty(c, UCHAR_LOWERCASE). * This is different from u_islower! * @param c Code point to test * @return true if the code point has the Lowercase Unicode property, false otherwise * * @see UCHAR_LOWERCASE * @see u_islower * @see u_hasBinaryProperty * @stable ICU 2.1 */ U_CAPI UBool U_EXPORT2 u_isULowercase(UChar32 c); /** * Check if a code point has the Uppercase Unicode property. * Same as u_hasBinaryProperty(c, UCHAR_UPPERCASE). * This is different from u_isupper! 
* @param c Code point to test * @return true if the code point has the Uppercase Unicode property, false otherwise * * @see UCHAR_UPPERCASE * @see u_isupper * @see u_hasBinaryProperty * @stable ICU 2.1 */ U_CAPI UBool U_EXPORT2 u_isUUppercase(UChar32 c); /** * Check if a code point has the White_Space Unicode property. * Same as u_hasBinaryProperty(c, UCHAR_WHITE_SPACE). * This is different from both u_isspace and u_isWhitespace! * * Note: There are several ICU whitespace functions; please see the uchar.h * file documentation for a detailed comparison. * * @param c Code point to test * @return true if the code point has the White_Space Unicode property, false otherwise. * * @see UCHAR_WHITE_SPACE * @see u_isWhitespace * @see u_isspace * @see u_isJavaSpaceChar * @see u_hasBinaryProperty * @stable ICU 2.1 */ U_CAPI UBool U_EXPORT2 u_isUWhiteSpace(UChar32 c); /** * Get the property value for an enumerated or integer Unicode property for a code point. * Also returns binary and mask property values. * * Unicode, especially in version 3.2, defines many more properties than the * original set in UnicodeData.txt. * * The properties APIs are intended to reflect Unicode properties as defined * in the Unicode Character Database (UCD) and Unicode Technical Reports (UTR). * For details about the properties see http://www.unicode.org/ . * For names of Unicode properties see the UCD file PropertyAliases.txt. * * Sample usage: * UEastAsianWidth ea=(UEastAsianWidth)u_getIntPropertyValue(c, UCHAR_EAST_ASIAN_WIDTH); * UBool b=(UBool)u_getIntPropertyValue(c, UCHAR_IDEOGRAPHIC); * * @param c Code point to test. * @param which UProperty selector constant, identifies which property to check. * Must be UCHAR_BINARY_START<=which= NTDDI_WIN10_CO) /** * Returns an immutable UCPMap for an enumerated/catalog/int-valued property. * The library retains ownership over the returned object. * Sets an error code if the property number is not one for an "int property". 
* * The returned object maps all Unicode code points to their values for that property. * For documentation of the integer values see u_getIntPropertyValue(). * * @param property UCHAR_INT_START..UCHAR_INT_LIMIT-1 * @param pErrorCode an in/out ICU UErrorCode * @return the property as a map * @see UProperty * @see u_getIntPropertyValue * @stable ICU 63 */ U_CAPI const UCPMap * U_EXPORT2 u_getIntPropertyMap(UProperty property, UErrorCode *pErrorCode); #endif // (NTDDI_VERSION >= NTDDI_WIN10_CO) /** * Get the numeric value for a Unicode code point as defined in the * Unicode Character Database. * * A "double" return type is necessary because * some numeric values are fractions, negative, or too large for int32_t. * * For characters without any numeric values in the Unicode Character Database, * this function will return U_NO_NUMERIC_VALUE. * Note: This is different from the Unicode Standard which specifies NaN as the default value. * (NaN is not available on all platforms.) * * Similar to java.lang.Character.getNumericValue(), but u_getNumericValue() * also supports negative values, large values, and fractions, * while Java's getNumericValue() returns values 10..35 for ASCII letters. * * @param c Code point to get the numeric value for. * @return Numeric value of c, or U_NO_NUMERIC_VALUE if none is defined. * * @see U_NO_NUMERIC_VALUE * @stable ICU 2.2 */ U_CAPI double U_EXPORT2 u_getNumericValue(UChar32 c); /** * Special value that is returned by u_getNumericValue when * no numeric value is defined for a code point. * * @see u_getNumericValue * @stable ICU 2.2 */ #define U_NO_NUMERIC_VALUE ((double)-123456789.) /** * Determines whether the specified code point has the general category "Ll" * (lowercase letter). * * Same as java.lang.Character.isLowerCase(). * * This misses some characters that are also lowercase but * have a different general category value. * In order to include those, use UCHAR_LOWERCASE. 
*
 * In addition to being equivalent to a Java function, this also serves
 * as a C/POSIX migration function.
 * See the comments about C/POSIX character classification functions in the
 * documentation at the top of this header file.
 *
 * @param c the code point to be tested
 * @return true if the code point is an Ll lowercase letter
 *
 * @see UCHAR_LOWERCASE
 * @see u_isupper
 * @see u_istitle
 * @stable ICU 2.0
 */
U_CAPI UBool U_EXPORT2
u_islower(UChar32 c);

/**
 * Determines whether the specified code point has the general category "Lu"
 * (uppercase letter).
 *
 * Same as java.lang.Character.isUpperCase().
 *
 * This misses some characters that are also uppercase but
 * have a different general category value.
 * In order to include those, use UCHAR_UPPERCASE.
 *
 * In addition to being equivalent to a Java function, this also serves
 * as a C/POSIX migration function.
 * See the comments about C/POSIX character classification functions in the
 * documentation at the top of this header file.
 *
 * @param c the code point to be tested
 * @return true if the code point is an Lu uppercase letter
 *
 * @see UCHAR_UPPERCASE
 * @see u_islower
 * @see u_istitle
 * @see u_tolower
 * @stable ICU 2.0
 */
U_CAPI UBool U_EXPORT2
u_isupper(UChar32 c);

/**
 * Determines whether the specified code point is a titlecase letter.
 * True for general category "Lt" (titlecase letter).
 *
 * Same as java.lang.Character.isTitleCase().
 *
 * @param c the code point to be tested
 * @return true if the code point is an Lt titlecase letter
 *
 * @see u_isupper
 * @see u_islower
 * @see u_totitle
 * @stable ICU 2.0
 */
U_CAPI UBool U_EXPORT2
u_istitle(UChar32 c);

/**
 * Determines whether the specified code point is a digit character according to Java.
 * True for characters with general category "Nd" (decimal digit numbers).
 * Beginning with Unicode 4, this is the same as
 * testing for the Numeric_Type of Decimal.
 *
 * Same as java.lang.Character.isDigit().
*
 * In addition to being equivalent to a Java function, this also serves
 * as a C/POSIX migration function.
 * See the comments about C/POSIX character classification functions in the
 * documentation at the top of this header file.
 *
 * @param c the code point to be tested
 * @return true if the code point is a digit character according to Character.isDigit()
 *
 * @stable ICU 2.0
 */
U_CAPI UBool U_EXPORT2
u_isdigit(UChar32 c);

/**
 * Determines whether the specified code point is a letter character.
 * True for general categories "L" (letters).
 *
 * Same as java.lang.Character.isLetter().
 *
 * In addition to being equivalent to a Java function, this also serves
 * as a C/POSIX migration function.
 * See the comments about C/POSIX character classification functions in the
 * documentation at the top of this header file.
 *
 * @param c the code point to be tested
 * @return true if the code point is a letter character
 *
 * @see u_isdigit
 * @see u_isalnum
 * @stable ICU 2.0
 */
U_CAPI UBool U_EXPORT2
u_isalpha(UChar32 c);

/**
 * Determines whether the specified code point is an alphanumeric character
 * (letter or digit) according to Java.
 * True for characters with general categories
 * "L" (letters) and "Nd" (decimal digit numbers).
 *
 * Same as java.lang.Character.isLetterOrDigit().
 *
 * In addition to being equivalent to a Java function, this also serves
 * as a C/POSIX migration function.
 * See the comments about C/POSIX character classification functions in the
 * documentation at the top of this header file.
 *
 * @param c the code point to be tested
 * @return true if the code point is an alphanumeric character according to Character.isLetterOrDigit()
 *
 * @stable ICU 2.0
 */
U_CAPI UBool U_EXPORT2
u_isalnum(UChar32 c);

/**
 * Determines whether the specified code point is a hexadecimal digit.
 * This is equivalent to u_digit(c, 16)>=0.
 * True for characters with general category "Nd" (decimal digit numbers)
 * as well as Latin letters a-f and A-F in both ASCII and Fullwidth ASCII.
* (That is, for letters with code points
 * 0041..0046, 0061..0066, FF21..FF26, FF41..FF46.)
 *
 * In order to narrow the definition of hexadecimal digits to only ASCII
 * characters, use (c<=0x7f && u_isxdigit(c)).
 *
 * This is a C/POSIX migration function.
 * See the comments about C/POSIX character classification functions in the
 * documentation at the top of this header file.
 *
 * @param c the code point to be tested
 * @return true if the code point is a hexadecimal digit
 *
 * @stable ICU 2.6
 */
U_CAPI UBool U_EXPORT2
u_isxdigit(UChar32 c);

/**
 * Determines whether the specified code point is a punctuation character.
 * True for characters with general categories "P" (punctuation).
 *
 * This is a C/POSIX migration function.
 * See the comments about C/POSIX character classification functions in the
 * documentation at the top of this header file.
 *
 * @param c the code point to be tested
 * @return true if the code point is a punctuation character
 *
 * @stable ICU 2.6
 */
U_CAPI UBool U_EXPORT2
u_ispunct(UChar32 c);

/**
 * Determines whether the specified code point is a "graphic" character
 * (printable, excluding spaces).
 * true for all characters except those with general categories
 * "Cc" (control codes), "Cf" (format controls), "Cs" (surrogates),
 * "Cn" (unassigned), and "Z" (separators).
 *
 * This is a C/POSIX migration function.
 * See the comments about C/POSIX character classification functions in the
 * documentation at the top of this header file.
 *
 * @param c the code point to be tested
 * @return true if the code point is a "graphic" character
 *
 * @stable ICU 2.6
 */
U_CAPI UBool U_EXPORT2
u_isgraph(UChar32 c);

/**
 * Determines whether the specified code point is a "blank" or "horizontal space",
 * a character that visibly separates words on a line.
* The following are equivalent definitions:
 *
 * true for Unicode White_Space characters except for "vertical space controls"
 * where "vertical space controls" are the following characters:
 * U+000A (LF) U+000B (VT) U+000C (FF) U+000D (CR) U+0085 (NEL) U+2028 (LS) U+2029 (PS)
 *
 * same as
 *
 * true for U+0009 (TAB) and characters with general category "Zs" (space separators).
 *
 * Note: There are several ICU whitespace functions; please see the uchar.h
 * file documentation for a detailed comparison.
 *
 * This is a C/POSIX migration function.
 * See the comments about C/POSIX character classification functions in the
 * documentation at the top of this header file.
 *
 * @param c the code point to be tested
 * @return true if the code point is a "blank"
 *
 * @stable ICU 2.6
 */
U_CAPI UBool U_EXPORT2
u_isblank(UChar32 c);

/**
 * Determines whether the specified code point is "defined",
 * which usually means that it is assigned a character.
 * True for general categories other than "Cn" (other, not assigned),
 * i.e., true for all code points mentioned in UnicodeData.txt.
 *
 * Note that non-character code points (e.g., U+FDD0) are not "defined"
 * (they are Cn), but surrogate code points are "defined" (Cs).
 *
 * Same as java.lang.Character.isDefined().
 *
 * @param c the code point to be tested
 * @return true if the code point is assigned a character
 *
 * @see u_isdigit
 * @see u_isalpha
 * @see u_isalnum
 * @see u_isupper
 * @see u_islower
 * @see u_istitle
 * @stable ICU 2.0
 */
U_CAPI UBool U_EXPORT2
u_isdefined(UChar32 c);

/**
 * Determines if the specified character is a space character or not.
 *
 * Note: There are several ICU whitespace functions; please see the uchar.h
 * file documentation for a detailed comparison.
 *
 * This is a C/POSIX migration function.
 * See the comments about C/POSIX character classification functions in the
 * documentation at the top of this header file.
 *
 * @param c the character to be tested
 * @return true if the character is a space character; false otherwise.
*
 * @see u_isJavaSpaceChar
 * @see u_isWhitespace
 * @see u_isUWhiteSpace
 * @stable ICU 2.0
 */
U_CAPI UBool U_EXPORT2
u_isspace(UChar32 c);

/**
 * Determine if the specified code point is a space character according to Java.
 * True for characters with general categories "Z" (separators),
 * which does not include control codes (e.g., TAB or Line Feed).
 *
 * Same as java.lang.Character.isSpaceChar().
 *
 * Note: There are several ICU whitespace functions; please see the uchar.h
 * file documentation for a detailed comparison.
 *
 * @param c the code point to be tested
 * @return true if the code point is a space character according to Character.isSpaceChar()
 *
 * @see u_isspace
 * @see u_isWhitespace
 * @see u_isUWhiteSpace
 * @stable ICU 2.6
 */
U_CAPI UBool U_EXPORT2
u_isJavaSpaceChar(UChar32 c);

/**
 * Determines if the specified code point is a whitespace character according to Java/ICU.
 * A character is considered to be a Java whitespace character if and only
 * if it satisfies one of the following criteria:
 *
 * - It is a Unicode Separator character (categories "Z" = "Zs" or "Zl" or "Zp"), but is not
 * also a non-breaking space (U+00A0 NBSP or U+2007 Figure Space or U+202F Narrow NBSP).
 * - It is U+0009 HORIZONTAL TABULATION.
 * - It is U+000A LINE FEED.
 * - It is U+000B VERTICAL TABULATION.
 * - It is U+000C FORM FEED.
 * - It is U+000D CARRIAGE RETURN.
 * - It is U+001C FILE SEPARATOR.
 * - It is U+001D GROUP SEPARATOR.
 * - It is U+001E RECORD SEPARATOR.
 * - It is U+001F UNIT SEPARATOR.
 *
 * This API tries to sync with the semantics of Java's
 * java.lang.Character.isWhitespace(), but it may not return
 * the exact same results because of the Unicode version
 * difference.
 *
 * Note: Unicode 4.0.1 changed U+200B ZERO WIDTH SPACE from a Space Separator (Zs)
 * to a Format Control (Cf). Since then, isWhitespace(0x200b) returns false.
* See http://www.unicode.org/versions/Unicode4.0.1/
 *
 * Note: There are several ICU whitespace functions; please see the uchar.h
 * file documentation for a detailed comparison.
 *
 * @param c the code point to be tested
 * @return true if the code point is a whitespace character according to Java/ICU
 *
 * @see u_isspace
 * @see u_isJavaSpaceChar
 * @see u_isUWhiteSpace
 * @stable ICU 2.0
 */
U_CAPI UBool U_EXPORT2
u_isWhitespace(UChar32 c);

/**
 * Determines whether the specified code point is a control character
 * (as defined by this function).
 * A control character is one of the following:
 * - ISO 8-bit control character (U+0000..U+001f and U+007f..U+009f)
 * - U_CONTROL_CHAR (Cc)
 * - U_FORMAT_CHAR (Cf)
 * - U_LINE_SEPARATOR (Zl)
 * - U_PARAGRAPH_SEPARATOR (Zp)
 *
 * This is a C/POSIX migration function.
 * See the comments about C/POSIX character classification functions in the
 * documentation at the top of this header file.
 *
 * @param c the code point to be tested
 * @return true if the code point is a control character
 *
 * @see UCHAR_DEFAULT_IGNORABLE_CODE_POINT
 * @see u_isprint
 * @stable ICU 2.0
 */
U_CAPI UBool U_EXPORT2
u_iscntrl(UChar32 c);

/**
 * Determines whether the specified code point is an ISO control code.
 * True for U+0000..U+001f and U+007f..U+009f (general category "Cc").
 *
 * Same as java.lang.Character.isISOControl().
 *
 * @param c the code point to be tested
 * @return true if the code point is an ISO control code
 *
 * @see u_iscntrl
 * @stable ICU 2.6
 */
U_CAPI UBool U_EXPORT2
u_isISOControl(UChar32 c);

/**
 * Determines whether the specified code point is a printable character.
 * True for general categories other than "C" (controls).
 *
 * This is a C/POSIX migration function.
 * See the comments about C/POSIX character classification functions in the
 * documentation at the top of this header file.
*
 * @param c the code point to be tested
 * @return true if the code point is a printable character
 *
 * @see UCHAR_DEFAULT_IGNORABLE_CODE_POINT
 * @see u_iscntrl
 * @stable ICU 2.0
 */
U_CAPI UBool U_EXPORT2
u_isprint(UChar32 c);

/**
 * Non-standard: Determines whether the specified code point is a base character.
 * True for general categories "L" (letters), "N" (numbers),
 * "Mc" (spacing combining marks), and "Me" (enclosing marks).
 *
 * Note that this is different from the Unicode Standard definition in
 * chapter 3.6, conformance clause D51 “Base character”,
 * which defines base characters as the code points with general categories
 * Letter (L), Number (N), Punctuation (P), Symbol (S), or Space Separator (Zs).
 *
 * @param c the code point to be tested
 * @return true if the code point is a base character according to this function
 *
 * @see u_isalpha
 * @see u_isdigit
 * @stable ICU 2.0
 */
U_CAPI UBool U_EXPORT2
u_isbase(UChar32 c);

/**
 * Returns the bidirectional category value for the code point,
 * which is used in the Unicode bidirectional algorithm
 * (UAX #9 http://www.unicode.org/reports/tr9/).
 * Note that some unassigned code points have bidi values
 * of R or AL because they are in blocks that are reserved
 * for Right-To-Left scripts.
 *
 * Same as java.lang.Character.getDirectionality()
 *
 * @param c the code point to be tested
 * @return the bidirectional category (UCharDirection) value
 *
 * @see UCharDirection
 * @stable ICU 2.0
 */
U_CAPI UCharDirection U_EXPORT2
u_charDirection(UChar32 c);

/**
 * Determines whether the code point has the Bidi_Mirrored property.
 * This property is set for characters that are commonly used in
 * Right-To-Left contexts and need to be displayed with a "mirrored"
 * glyph.
 *
 * Same as java.lang.Character.isMirrored().
* Same as UCHAR_BIDI_MIRRORED
 *
 * @param c the code point to be tested
 * @return true if the character has the Bidi_Mirrored property
 *
 * @see UCHAR_BIDI_MIRRORED
 * @stable ICU 2.0
 */
U_CAPI UBool U_EXPORT2
u_isMirrored(UChar32 c);

/**
 * Maps the specified character to a "mirror-image" character.
 * For characters with the Bidi_Mirrored property, implementations
 * sometimes need a "poor man's" mapping to another Unicode
 * character (code point) such that the default glyph may serve
 * as the mirror-image of the default glyph of the specified
 * character. This is useful for text conversion to and from
 * codepages with visual order, and for displays without glyph
 * selection capabilities.
 *
 * @param c the code point to be mapped
 * @return another Unicode code point that may serve as a mirror-image
 * substitute, or c itself if there is no such mapping or c
 * does not have the Bidi_Mirrored property
 *
 * @see UCHAR_BIDI_MIRRORED
 * @see u_isMirrored
 * @stable ICU 2.0
 */
U_CAPI UChar32 U_EXPORT2
u_charMirror(UChar32 c);

/**
 * Maps the specified character to its paired bracket character.
 * For Bidi_Paired_Bracket_Type!=None, this is the same as u_charMirror().
 * Otherwise c itself is returned.
 * See http://www.unicode.org/reports/tr9/
 *
 * @param c the code point to be mapped
 * @return the paired bracket code point,
 * or c itself if there is no such mapping
 * (Bidi_Paired_Bracket_Type=None)
 *
 * @see UCHAR_BIDI_PAIRED_BRACKET
 * @see UCHAR_BIDI_PAIRED_BRACKET_TYPE
 * @see u_charMirror
 * @stable ICU 52
 */
U_CAPI UChar32 U_EXPORT2
u_getBidiPairedBracket(UChar32 c);

/**
 * Returns the general category value for the code point.
 *
 * Same as java.lang.Character.getType().
 *
 * @param c the code point to be tested
 * @return the general category (UCharCategory) value
 *
 * @see UCharCategory
 * @stable ICU 2.0
 */
U_CAPI int8_t U_EXPORT2
u_charType(UChar32 c);

/**
 * Get a single-bit bit set for the general category of a character.
* This bit set can be compared bitwise with U_GC_SM_MASK, U_GC_L_MASK, etc. * Same as U_MASK(u_charType(c)). * * @param c the code point to be tested * @return a single-bit mask corresponding to the general category (UCharCategory) value * * @see u_charType * @see UCharCategory * @see U_GC_CN_MASK * @stable ICU 2.1 */ #define U_GET_GC_MASK(c) U_MASK(u_charType(c)) /** * Callback from u_enumCharTypes(), is called for each contiguous range * of code points c (where start<=cnameChoice, the character name written * into the buffer is the "modern" name or the name that was defined * in Unicode version 1.0. * The name contains only "invariant" characters * like A-Z, 0-9, space, and '-'. * Unicode 1.0 names are only retrieved if they are different from the modern * names and if the data file contains the data for them. gennames may or may * not be called with a command line option to include 1.0 names in unames.dat. * * @param code The character (code point) for which to get the name. * It must be 0<=code<=0x10ffff. * @param nameChoice Selector for which name to get. * @param buffer Destination address for copying the name. * The name will always be zero-terminated. * If there is no name, then the buffer will be set to the empty string. * @param bufferLength ==sizeof(buffer) * @param pErrorCode Pointer to a UErrorCode variable; * check for U_SUCCESS() after u_charName() * returns. * @return The length of the name, or 0 if there is no name for this character. * If the bufferLength is less than or equal to the length, then the buffer * contains the truncated name and the returned length indicates the full * length of the name. * The length does not include the zero-termination. 
* * @see UCharNameChoice * @see u_charFromName * @see u_enumCharNames * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 u_charName(UChar32 code, UCharNameChoice nameChoice, char *buffer, int32_t bufferLength, UErrorCode *pErrorCode); /** * Find a Unicode character by its name and return its code point value. * The name is matched exactly and completely. * If the name does not correspond to a code point, pErrorCode * is set to U_INVALID_CHAR_FOUND. * A Unicode 1.0 name is matched only if it differs from the modern name. * Unicode names are all uppercase. Extended names are lowercase followed * by an uppercase hexadecimal number, and within angle brackets. * * @param nameChoice Selector for which name to match. * @param name The name to match. * @param pErrorCode Pointer to a UErrorCode variable * @return The Unicode value of the code point with the given name, * or an undefined value if there is no such code point. * * @see UCharNameChoice * @see u_charName * @see u_enumCharNames * @stable ICU 1.7 */ U_CAPI UChar32 U_EXPORT2 u_charFromName(UCharNameChoice nameChoice, const char *name, UErrorCode *pErrorCode); /** * Type of a callback function for u_enumCharNames() that gets called * for each Unicode character with the code point value and * the character name. * If such a function returns false, then the enumeration is stopped. * * @param context The context pointer that was passed to u_enumCharNames(). * @param code The Unicode code point for the character with this name. * @param nameChoice Selector for which kind of names is enumerated. * @param name The character's name, zero-terminated. * @param length The length of the name. * @return true if the enumeration should continue, false to stop it. 
* * @see UCharNameChoice * @see u_enumCharNames * @stable ICU 1.7 */ typedef UBool U_CALLCONV UEnumCharNamesFn(void *context, UChar32 code, UCharNameChoice nameChoice, const char *name, int32_t length); /** * Enumerate all assigned Unicode characters between the start and limit * code points (start inclusive, limit exclusive) and call a function * for each, passing the code point value and the character name. * For Unicode 1.0 names, only those are enumerated that differ from the * modern names. * * @param start The first code point in the enumeration range. * @param limit One more than the last code point in the enumeration range * (the first one after the range). * @param fn The function that is to be called for each character name. * @param context An arbitrary pointer that is passed to the function. * @param nameChoice Selector for which kind of names to enumerate. * @param pErrorCode Pointer to a UErrorCode variable * * @see UCharNameChoice * @see UEnumCharNamesFn * @see u_charName * @see u_charFromName * @stable ICU 1.7 */ U_CAPI void U_EXPORT2 u_enumCharNames(UChar32 start, UChar32 limit, UEnumCharNamesFn *fn, void *context, UCharNameChoice nameChoice, UErrorCode *pErrorCode); /** * Return the Unicode name for a given property, as given in the * Unicode database file PropertyAliases.txt. * * In addition, this function maps the property * UCHAR_GENERAL_CATEGORY_MASK to the synthetic names "gcm" / * "General_Category_Mask". These names are not in * PropertyAliases.txt. * * @param property UProperty selector other than UCHAR_INVALID_CODE. * If out of range, NULL is returned. * * @param nameChoice selector for which name to get. If out of range, * NULL is returned. All properties have a long name. Most * have a short name, but some do not. Unicode allows for * additional names; if present these will be returned by * U_LONG_PROPERTY_NAME + i, where i=1, 2,... * * @return a pointer to the name, or NULL if either the * property or the nameChoice is out of range. 
If a given * nameChoice returns NULL, then all larger values of * nameChoice will return NULL, with one exception: if NULL is * returned for U_SHORT_PROPERTY_NAME, then * U_LONG_PROPERTY_NAME (and higher) may still return a * non-NULL value. The returned pointer is valid until * u_cleanup() is called. * * @see UProperty * @see UPropertyNameChoice * @stable ICU 2.4 */ U_CAPI const char* U_EXPORT2 u_getPropertyName(UProperty property, UPropertyNameChoice nameChoice); /** * Return the UProperty enum for a given property name, as specified * in the Unicode database file PropertyAliases.txt. Short, long, and * any other variants are recognized. * * In addition, this function maps the synthetic names "gcm" / * "General_Category_Mask" to the property * UCHAR_GENERAL_CATEGORY_MASK. These names are not in * PropertyAliases.txt. * * @param alias the property name to be matched. The name is compared * using "loose matching" as described in PropertyAliases.txt. * * @return a UProperty enum, or UCHAR_INVALID_CODE if the given name * does not match any property. * * @see UProperty * @stable ICU 2.4 */ U_CAPI UProperty U_EXPORT2 u_getPropertyEnum(const char* alias); /** * Return the Unicode name for a given property value, as given in the * Unicode database file PropertyValueAliases.txt. * * Note: Some of the names in PropertyValueAliases.txt can only be * retrieved using UCHAR_GENERAL_CATEGORY_MASK, not * UCHAR_GENERAL_CATEGORY. These include: "C" / "Other", "L" / * "Letter", "LC" / "Cased_Letter", "M" / "Mark", "N" / "Number", "P" * / "Punctuation", "S" / "Symbol", and "Z" / "Separator". * * @param property UProperty selector constant. * Must be UCHAR_BINARY_START<=which2<=radix<=36 or if the * value of c is not a valid digit in the specified * radix, -1 is returned. A character is a valid digit * if at least one of the following is true: *
    *
  • The character has a decimal digit value. * Such characters have the general category "Nd" (decimal digit numbers) * and a Numeric_Type of Decimal. * In this case the value is the character's decimal digit value.
  • *
  • The character is one of the uppercase Latin letters * 'A' through 'Z'. * In this case the value is ch-'A'+10.
  • *
  • The character is one of the lowercase Latin letters * 'a' through 'z'. * In this case the value is ch-'a'+10.
  • *
  • Latin letters from both the ASCII range (0061..007A, 0041..005A) * as well as from the Fullwidth ASCII range (FF41..FF5A, FF21..FF3A) * are recognized.
  • *
* * Same as java.lang.Character.digit(). * * @param ch the code point to be tested. * @param radix the radix. * @return the numeric value represented by the character in the * specified radix, * or -1 if there is no value or if the value exceeds the radix. * * @see UCHAR_NUMERIC_TYPE * @see u_forDigit * @see u_charDigitValue * @see u_isdigit * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 u_digit(UChar32 ch, int8_t radix); /** * Determines the character representation for a specific digit in * the specified radix. If the value of radix is not a * valid radix, or the value of digit is not a valid * digit in the specified radix, the null character * (U+0000) is returned. *

* The radix argument is valid if it is greater than or * equal to 2 and less than or equal to 36. * The digit argument is valid if * 0 <= digit < radix. *

* If the digit is less than 10, then * '0' + digit is returned. Otherwise, the value * 'a' + digit - 10 is returned. * * Same as java.lang.Character.forDigit(). * * @param digit the number to convert to a character. * @param radix the radix. * @return the char representation of the specified digit * in the specified radix. * * @see u_digit * @see u_charDigitValue * @see u_isdigit * @stable ICU 2.0 */ U_CAPI UChar32 U_EXPORT2 u_forDigit(int32_t digit, int8_t radix); /** * Get the "age" of the code point. * The "age" is the Unicode version when the code point was first * designated (as a non-character or for Private Use) * or assigned a character. * This can be useful to avoid emitting code points to receiving * processes that do not accept newer characters. * The data is from the UCD file DerivedAge.txt. * * @param c The code point. * @param versionArray The Unicode version number array, to be filled in. * * @stable ICU 2.1 */ U_CAPI void U_EXPORT2 u_charAge(UChar32 c, UVersionInfo versionArray); /** * Gets the Unicode version information. * The version array is filled in with the version information * for the Unicode standard that is currently used by ICU. * For example, Unicode version 3.1.1 is represented as an array with * the values { 3, 1, 1, 0 }. * * @param versionArray an output array that will be filled in with * the Unicode version number * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 u_getUnicodeVersion(UVersionInfo versionArray); #if !UCONFIG_NO_NORMALIZATION /** * Get the FC_NFKC_Closure property string for a character. * See Unicode Standard Annex #15 for details, search for "FC_NFKC_Closure" * or for "FNC": http://www.unicode.org/reports/tr15/ * * @param c The character (code point) for which to get the FC_NFKC_Closure string. * It must be 0<=c<=0x10ffff. * @param dest Destination address for copying the string. * The string will be zero-terminated if possible. * If there is no FC_NFKC_Closure string, * then the buffer will be set to the empty string. 
* @param destCapacity ==sizeof(dest) * @param pErrorCode Pointer to a UErrorCode variable. * @return The length of the string, or 0 if there is no FC_NFKC_Closure string for this character. * If the destCapacity is less than or equal to the length, then the buffer * contains the truncated name and the returned length indicates the full * length of the name. * The length does not include the zero-termination. * * @stable ICU 2.2 */ U_CAPI int32_t U_EXPORT2 u_getFC_NFKC_Closure(UChar32 c, UChar *dest, int32_t destCapacity, UErrorCode *pErrorCode); #endif U_CDECL_END #endif /*_UCHAR*/ /*eof*/ // ubidi.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ****************************************************************************** * * Copyright (C) 1999-2013, International Business Machines * Corporation and others. All Rights Reserved. * ****************************************************************************** * file name: ubidi.h * encoding: UTF-8 * tab size: 8 (not used) * indentation:4 * * created on: 1999jul27 * created by: Markus W. Scherer, updated by Matitiahu Allouche */ #ifndef UBIDI_H #define UBIDI_H /** *\file * \brief C API: Bidi algorithm * *

Bidi algorithm for ICU

* * This is an implementation of the Unicode Bidirectional Algorithm. * The algorithm is defined in the * Unicode Standard Annex #9.

* * Note: Libraries that perform a bidirectional algorithm and * reorder strings accordingly are sometimes called "Storage Layout Engines". * ICU's Bidi and shaping (u_shapeArabic()) APIs can be used at the core of such * "Storage Layout Engines". * *

General remarks about the API:

* * In functions with an error code parameter, * the pErrorCode pointer must be valid * and the value that it points to must not indicate a failure before * the function call. Otherwise, the function returns immediately. * After the function call, the value indicates success or failure.

* * The "limit" of a sequence of characters is the position just after their * last character, i.e., one more than that position.

* * Some of the API functions provide access to "runs". * Such a "run" is defined as a sequence of characters * that are at the same embedding level * after performing the Bidi algorithm.

* * @author Markus W. Scherer * @version 1.0 * * *

Sample code for the ICU Bidi API

* *
Rendering a paragraph with the ICU Bidi API
* * This is (hypothetical) sample code that illustrates * how the ICU Bidi API could be used to render a paragraph of text. * Rendering code depends highly on the graphics system, * therefore this sample code must make a lot of assumptions, * which may or may not match any existing graphics system's properties. * *

The basic assumptions are:

*
    *
  • Rendering is done from left to right on a horizontal line.
  • *
  • A run of single-style, unidirectional text can be rendered at once.
  • *
  • Such a run of text is passed to the graphics system with * characters (code units) in logical order.
  • *
  • The line-breaking algorithm is very complicated * and Locale-dependent - * and therefore its implementation omitted from this sample code.
  • *
* *
 * \code
 *#include <unicode/ubidi.h>
 *
 *typedef enum {
 *     styleNormal=0, styleSelected=1,
 *     styleBold=2, styleItalics=4,
 *     styleSuper=8, styleSub=16
 *} Style;
 *
 *typedef struct { int32_t limit; Style style; } StyleRun;
 *
 *int getTextWidth(const UChar *text, int32_t start, int32_t limit,
 *                  const StyleRun *styleRuns, int styleRunCount);
 *
 * // set *pLimit and *pStyleRunLimit for a line
 * // from text[start] and from styleRuns[styleRunStart]
 * // using ubidi_getLogicalRun(para, ...)
 *void getLineBreak(const UChar *text, int32_t start, int32_t *pLimit,
 *                  UBiDi *para,
 *                  const StyleRun *styleRuns, int styleRunStart, int *pStyleRunLimit,
 *                  int *pLineWidth);
 *
 * // render runs on a line sequentially, always from left to right
 *
 * // prepare rendering a new line
 * void startLine(UBiDiDirection textDirection, int lineWidth);
 *
 * // render a run of text and advance to the right by the run width
 * // the text[start..limit-1] is always in logical order
 * void renderRun(const UChar *text, int32_t start, int32_t limit,
 *               UBiDiDirection textDirection, Style style);
 *
 * // We could compute a cross-product
 * // from the style runs with the directional runs
 * // and then reorder it.
 * // Instead, here we iterate over each run type
 * // and render the intersections -
 * // with shortcuts in simple (and common) cases.
 * // renderParagraph() is the main function.
 *
 * // render a directional run with
 * // (possibly) multiple style runs intersecting with it
 * void renderDirectionalRun(const UChar *text,
 *                           int32_t start, int32_t limit,
 *                           UBiDiDirection direction,
 *                           const StyleRun *styleRuns, int styleRunCount) {
 *     int i;
 *
 *     // iterate over style runs
 *     if(direction==UBIDI_LTR) {
 *         int styleLimit;
 *
 *         for(i=0; i<styleRunCount; ++i) {
 *             styleLimit=styleRuns[i].limit;
 *             if(start<styleLimit) {
 *                 if(styleLimit>limit) { styleLimit=limit; }
 *                 renderRun(text, start, styleLimit,
 *                           direction, styleRuns[i].style);
 *                 if(styleLimit==limit) { break; }
 *                 start=styleLimit;
 *             }
 *         }
 *     } else {
 *         int styleStart;
 *
 *         for(i=styleRunCount-1; i>=0; --i) {
 *             if(i>0) {
 *                 styleStart=styleRuns[i-1].limit;
 *             } else {
 *                 styleStart=0;
 *             }
 *             if(limit>=styleStart) {
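 *                 if(styleStart<start) { styleStart=start; }
 *                 renderRun(text, styleStart, limit,
 *                           direction, styleRuns[i].style);
 *                 if(styleStart==start) { break; }
 *                 limit=styleStart;
 *             }
 *         }
 *     }
 * }
 *
 * // the line object represents text[start..limit-1]
 * void renderLine(UBiDi *line, const UChar *text,
 *                 int32_t start, int32_t limit,
 *                 const StyleRun *styleRuns, int styleRunCount,
 *                 UErrorCode *pErrorCode) {
 *     UBiDiDirection direction=ubidi_getDirection(line);
 *     if(direction!=UBIDI_MIXED) {
 *         // unidirectional
 *         if(styleRunCount<=1) {
 *             renderRun(text, start, limit, direction, styleRuns[0].style);
 *         } else {
 *             renderDirectionalRun(text, start, limit,
 *                                  direction, styleRuns, styleRunCount);
 *         }
 *     } else {
 *         // mixed-directional
 *         int32_t count, i, length;
 *
 *         count=ubidi_countRuns(line, pErrorCode);
 *         if(U_SUCCESS(*pErrorCode)) {
 *             if(styleRunCount<=1) {
 *                 Style style=styleRuns[0].style;
 *
 *                 // iterate over directional runs
 *                 for(i=0; i<count; ++i) {
 *                     direction=ubidi_getVisualRun(line, i, &start, &length);
 *                     renderRun(text, start, start+length, direction, style);
 *                 }
 *             } else {
 *                 // iterate over both directional and style runs
 *                 for(i=0; i<count; ++i) {
 *                     direction=ubidi_getVisualRun(line, i, &start, &length);
 *                     renderDirectionalRun(text, start, start+length,
 *                                          direction, styleRuns, styleRunCount);
 *                 }
 *             }
 *         }
 *     }
 * }
 *
 *void renderParagraph(const UChar *text, int32_t length,
 *                     UBiDiDirection textDirection,
 *                     const StyleRun *styleRuns, int styleRunCount,
 *                     int lineWidth,
 *                     UErrorCode *pErrorCode) {
 *     UBiDi *para;
 *
 *     if(pErrorCode==NULL || U_FAILURE(*pErrorCode) || length<=0) {
 *         return;
 *     }
 *
 *     para=ubidi_openSized(length, 0, pErrorCode);
 *     if(para==NULL) { return; }
 *
 *     ubidi_setPara(para, text, length,
 *                   textDirection ? UBIDI_DEFAULT_RTL : UBIDI_DEFAULT_LTR,
 *                   NULL, pErrorCode);
 *     if(U_SUCCESS(*pErrorCode)) {
 *         UBiDiLevel paraLevel=1&ubidi_getParaLevel(para);
 *         StyleRun styleRun={ length, styleNormal };
 *         int width;
 *
 *         if(styleRuns==NULL || styleRunCount<=0) {
 *             styleRunCount=1;
 *             styleRuns=&styleRun;
 *         }
 *
 *         // assume styleRuns[styleRunCount-1].limit>=length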
 *
 *         width=getTextWidth(text, 0, length, styleRuns, styleRunCount);
 *         if(width<=lineWidth) {
 *             // everything fits onto one line
 *
 *            // prepare rendering a new line from either left or right
 *             startLine(paraLevel, width);
 *
 *             renderLine(para, text, 0, length,
 *                        styleRuns, styleRunCount, pErrorCode);
 *         } else {
 *             UBiDi *line;
 *
 *             // we need to render several lines
 *             line=ubidi_openSized(length, 0, pErrorCode);
 *             if(line!=NULL) {
 *                 int32_t start=0, limit;
 *                 int styleRunStart=0, styleRunLimit;
 *
 *                 for(;;) {
 *                     limit=length;
 *                     styleRunLimit=styleRunCount;
 *                     getLineBreak(text, start, &limit, para,
 *                                  styleRuns, styleRunStart, &styleRunLimit,
 *                                 &width);
 *                     ubidi_setLine(para, start, limit, line, pErrorCode);
 *                     if(U_SUCCESS(*pErrorCode)) {
 *                         // prepare rendering a new line
 *                         // from either left or right
 *                         startLine(paraLevel, width);
 *
 *                         renderLine(line, text, start, limit,
 *                                    styleRuns+styleRunStart,
 *                                    styleRunLimit-styleRunStart, pErrorCode);
 *                     }
 *                     if(limit==length) { break; }
 *                     start=limit;
 *                     styleRunStart=styleRunLimit-1;
 *                     if(start>=styleRuns[styleRunStart].limit) {
 *                         ++styleRunStart;
 *                     }
 *                 }
 *
 *                 ubidi_close(line);
 *             }
 *        }
 *    }
 *
 *     ubidi_close(para);
 *}
 *\endcode
 * 
*/ /*DOCXX_TAG*/ /*@{*/ /** * UBiDiLevel is the type of the level values in this * Bidi implementation. * It holds an embedding level and indicates the visual direction * by its bit 0 (even/odd value).

* * It can also hold non-level values for the * paraLevel and embeddingLevels * arguments of ubidi_setPara(); there: *

    *
  • bit 7 of an embeddingLevels[] * value indicates whether the using application is * specifying the level of a character to override whatever the * Bidi implementation would resolve it to.
  • *
  • paraLevel can be set to the * pseudo-level values UBIDI_DEFAULT_LTR * and UBIDI_DEFAULT_RTL.
  • *
* * @see ubidi_setPara * *

The related constants are not real, valid level values. * UBIDI_DEFAULT_XXX can be used to specify * a default for the paragraph level for * when the ubidi_setPara() function * shall determine it but there is no * strongly typed character in the input.

* * Note that the value for UBIDI_DEFAULT_LTR is even * and the one for UBIDI_DEFAULT_RTL is odd, * just like with normal LTR and RTL level values - * these special values are designed that way. Also, the implementation * assumes that UBIDI_MAX_EXPLICIT_LEVEL is odd. * * Note: The numeric values of the related constants will not change: * They are tied to the use of 7-bit byte values (plus the override bit) * and of the UBiDiLevel=uint8_t data type in this API. * * @see UBIDI_DEFAULT_LTR * @see UBIDI_DEFAULT_RTL * @see UBIDI_LEVEL_OVERRIDE * @see UBIDI_MAX_EXPLICIT_LEVEL * @stable ICU 2.0 */ typedef uint8_t UBiDiLevel; /** Paragraph level setting.

* * Constant indicating that the base direction depends on the first strong * directional character in the text according to the Unicode Bidirectional * Algorithm. If no strong directional character is present, * then set the paragraph level to 0 (left-to-right).

* * If this value is used in conjunction with reordering modes * UBIDI_REORDER_INVERSE_LIKE_DIRECT or * UBIDI_REORDER_INVERSE_FOR_NUMBERS_SPECIAL, the text to reorder * is assumed to be visual LTR, and the text after reordering is required * to be the corresponding logical string with appropriate contextual * direction. The direction of the result string will be RTL if either * the rightmost or leftmost strong character of the source text is RTL * or Arabic Letter; the direction will be LTR otherwise.

* * If reordering option UBIDI_OPTION_INSERT_MARKS is set, an RLM may * be added at the beginning of the result string to ensure round trip * (that the result string, when reordered back to visual, will produce * the original source text). * @see UBIDI_REORDER_INVERSE_LIKE_DIRECT * @see UBIDI_REORDER_INVERSE_FOR_NUMBERS_SPECIAL * @stable ICU 2.0 */ #define UBIDI_DEFAULT_LTR 0xfe /** Paragraph level setting.

* * Constant indicating that the base direction depends on the first strong * directional character in the text according to the Unicode Bidirectional * Algorithm. If no strong directional character is present, * then set the paragraph level to 1 (right-to-left).

* * If this value is used in conjunction with reordering modes * UBIDI_REORDER_INVERSE_LIKE_DIRECT or * UBIDI_REORDER_INVERSE_FOR_NUMBERS_SPECIAL, the text to reorder * is assumed to be visual LTR, and the text after reordering is required * to be the corresponding logical string with appropriate contextual * direction. The direction of the result string will be RTL if either * the rightmost or leftmost strong character of the source text is RTL * or Arabic Letter, or if the text contains no strong character; * the direction will be LTR otherwise.

* * If reordering option UBIDI_OPTION_INSERT_MARKS is set, an RLM may * be added at the beginning of the result string to ensure round trip * (that the result string, when reordered back to visual, will produce * the original source text). * @see UBIDI_REORDER_INVERSE_LIKE_DIRECT * @see UBIDI_REORDER_INVERSE_FOR_NUMBERS_SPECIAL * @stable ICU 2.0 */ #define UBIDI_DEFAULT_RTL 0xff /** * Maximum explicit embedding level. * Same as the max_depth value in the * Unicode Bidirectional Algorithm. * (The maximum resolved level can be up to UBIDI_MAX_EXPLICIT_LEVEL+1). * @stable ICU 2.0 */ #define UBIDI_MAX_EXPLICIT_LEVEL 125 /** Bit flag for level input. * Overrides directional properties. * @stable ICU 2.0 */ #define UBIDI_LEVEL_OVERRIDE 0x80 /** * Special value which can be returned by the mapping functions when a logical * index has no corresponding visual index or vice-versa. This may happen * for the logical-to-visual mapping of a Bidi control when option * #UBIDI_OPTION_REMOVE_CONTROLS is specified. This can also happen * for the visual-to-logical mapping of a Bidi mark (LRM or RLM) inserted * by option #UBIDI_OPTION_INSERT_MARKS. * @see ubidi_getVisualIndex * @see ubidi_getVisualMap * @see ubidi_getLogicalIndex * @see ubidi_getLogicalMap * @stable ICU 3.6 */ #define UBIDI_MAP_NOWHERE (-1) /** * UBiDiDirection values indicate the text direction. * @stable ICU 2.0 */ enum UBiDiDirection { /** Left-to-right text. This is a 0 value. *

    *
  • As return value for ubidi_getDirection(), it means * that the source string contains no right-to-left characters, or * that the source string is empty and the paragraph level is even. *
  • As return value for ubidi_getBaseDirection(), it * means that the first strong character of the source string has * a left-to-right direction. *
* @stable ICU 2.0 */ UBIDI_LTR, /** Right-to-left text. This is a 1 value. *
    *
  • As return value for ubidi_getDirection(), it means * that the source string contains no left-to-right characters, or * that the source string is empty and the paragraph level is odd. *
  • As return value for ubidi_getBaseDirection(), it * means that the first strong character of the source string has * a right-to-left direction. *
* @stable ICU 2.0 */ UBIDI_RTL, /** Mixed-directional text. *

As return value for ubidi_getDirection(), it means * that the source string contains both left-to-right and * right-to-left characters. * @stable ICU 2.0 */ UBIDI_MIXED, /** No strongly directional text. *

As return value for ubidi_getBaseDirection(), it means * that the source string is missing or empty, or contains neither left-to-right * nor right-to-left characters. * @stable ICU 4.6 */ UBIDI_NEUTRAL }; /** @stable ICU 2.0 */ typedef enum UBiDiDirection UBiDiDirection; /** * Forward declaration of the UBiDi structure for the declaration of * the API functions. Its fields are implementation-specific.

* This structure holds information about a paragraph (or multiple paragraphs) * of text with Bidi-algorithm-related details, or about one line of * such a paragraph.

* Reordering can be done on a line, or on one or more paragraphs which are * then interpreted each as one single line. * @stable ICU 2.0 */ struct UBiDi; /** @stable ICU 2.0 */ typedef struct UBiDi UBiDi; /** * Allocate a UBiDi structure. * Such an object is initially empty. It is assigned * the Bidi properties of a piece of text containing one or more paragraphs * by ubidi_setPara() * or the Bidi properties of a line within a paragraph by * ubidi_setLine().

* This object can be reused for as long as it is not deallocated * by calling ubidi_close().

* ubidi_setPara() and ubidi_setLine() will allocate * additional memory for internal structures as necessary. * * @return An empty UBiDi object. * @stable ICU 2.0 */ U_CAPI UBiDi * U_EXPORT2 ubidi_open(void); /** * Allocate a UBiDi structure with preallocated memory * for internal structures. * This function provides a UBiDi object like ubidi_open() * with no arguments, but it also preallocates memory for internal structures * according to the sizings supplied by the caller.

* Subsequent functions will not allocate any more memory, and are thus * guaranteed not to fail because of lack of memory.

* The preallocation can be limited to some of the internal memory * by setting some values to 0 here. That means that if, e.g., * maxRunCount cannot be reasonably predetermined and should not * be set to maxLength (the only failproof value) to avoid * wasting memory, then maxRunCount could be set to 0 here * and the internal structures that are associated with it will be allocated * on demand, just like with ubidi_open(). * * @param maxLength is the maximum text or line length that internal memory * will be preallocated for. An attempt to associate this object with a * longer text will fail, unless this value is 0, which leaves the allocation * up to the implementation. * * @param maxRunCount is the maximum anticipated number of same-level runs * that internal memory will be preallocated for. An attempt to access * visual runs on an object that was not preallocated for as many runs * as the text was actually resolved to will fail, * unless this value is 0, which leaves the allocation up to the implementation.

* The number of runs depends on the actual text and may be anywhere between * 1 and maxLength. It is typically small. * * @param pErrorCode must be a valid pointer to an error code value. * * @return An empty UBiDi object with preallocated memory. * @stable ICU 2.0 */ U_CAPI UBiDi * U_EXPORT2 ubidi_openSized(int32_t maxLength, int32_t maxRunCount, UErrorCode *pErrorCode); /** * ubidi_close() must be called to free the memory * associated with a UBiDi object.

* * Important: * A parent UBiDi object must not be destroyed or reused if * it still has children. * If a UBiDi object has become the child * of another one (its parent) by calling * ubidi_setLine(), then the child object must * be destroyed (closed) or reused (by calling * ubidi_setPara() or ubidi_setLine()) * before the parent object. * * @param pBiDi is a UBiDi object. * * @see ubidi_setPara * @see ubidi_setLine * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ubidi_close(UBiDi *pBiDi); /** * Modify the operation of the Bidi algorithm such that it * approximates an "inverse Bidi" algorithm. This function * must be called before ubidi_setPara(). * *

The normal operation of the Bidi algorithm as described * in the Unicode Technical Report is to take text stored in logical * (keyboard, typing) order and to determine the reordering of it for visual * rendering. * Some legacy systems store text in visual order, and for operations * with standard, Unicode-based algorithms, the text needs to be transformed * to logical order. This is effectively the inverse algorithm of the * described Bidi algorithm. Note that there is no standard algorithm for * this "inverse Bidi" and that the current implementation provides only an * approximation of "inverse Bidi".

* *

With isInverse set to true, * this function changes the behavior of some of the subsequent functions * in a way that they can be used for the inverse Bidi algorithm. * Specifically, runs of text with numeric characters will be treated in a * special way and may need to be surrounded with LRM characters when they are * written in reordered sequence.

* *

Output runs should be retrieved using ubidi_getVisualRun(). * Since the actual input for "inverse Bidi" is visually ordered text and * ubidi_getVisualRun() gets the reordered runs, these are actually * the runs of the logically ordered output.

* *

Calling this function with argument isInverse set to * true is equivalent to calling * ubidi_setReorderingMode with argument * reorderingMode * set to #UBIDI_REORDER_INVERSE_NUMBERS_AS_L.
* Calling this function with argument isInverse set to * false is equivalent to calling * ubidi_setReorderingMode with argument * reorderingMode * set to #UBIDI_REORDER_DEFAULT. * * @param pBiDi is a UBiDi object. * * @param isInverse specifies "forward" or "inverse" Bidi operation. * * @see ubidi_setPara * @see ubidi_writeReordered * @see ubidi_setReorderingMode * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ubidi_setInverse(UBiDi *pBiDi, UBool isInverse); /** * Is this Bidi object set to perform the inverse Bidi algorithm? *

Note: calling this function after setting the reordering mode with * ubidi_setReorderingMode will return true if the * reordering mode was set to #UBIDI_REORDER_INVERSE_NUMBERS_AS_L, * false for all other values.

* * @param pBiDi is a UBiDi object. * @return true if the Bidi object is set to perform the inverse Bidi algorithm * by handling numbers as L. * * @see ubidi_setInverse * @see ubidi_setReorderingMode * @stable ICU 2.0 */ U_CAPI UBool U_EXPORT2 ubidi_isInverse(UBiDi *pBiDi); /** * Specify whether block separators must be allocated level zero, * so that successive paragraphs will progress from left to right. * This function must be called before ubidi_setPara(). * Paragraph separators (B) may appear in the text. Setting them to level zero * means that all paragraph separators (including one possibly appearing * in the last text position) are kept in the reordered text after the text * that they follow in the source text. * When this feature is not enabled, a paragraph separator at the last * position of the text before reordering will go to the first position * of the reordered text when the paragraph level is odd. * * @param pBiDi is a UBiDi object. * * @param orderParagraphsLTR specifies whether paragraph separators (B) must * receive level 0, so that successive paragraphs progress from left to right. * * @see ubidi_setPara * @stable ICU 3.4 */ U_CAPI void U_EXPORT2 ubidi_orderParagraphsLTR(UBiDi *pBiDi, UBool orderParagraphsLTR); /** * Is this Bidi object set to allocate level 0 to block separators so that * successive paragraphs progress from left to right? * * @param pBiDi is a UBiDi object. * @return true if the Bidi object is set to allocate level 0 to block * separators. * * @see ubidi_orderParagraphsLTR * @stable ICU 3.4 */ U_CAPI UBool U_EXPORT2 ubidi_isOrderParagraphsLTR(UBiDi *pBiDi); /** * UBiDiReorderingMode values indicate which variant of the Bidi * algorithm to use. * Only UBIDI_REORDER_DEFAULT has an explicit value (0); the remaining * enumerators take consecutive values 1 through 6 in declaration order. * * @see ubidi_setReorderingMode * @stable ICU 3.6 */ typedef enum UBiDiReorderingMode { /** Regular Logical to Visual Bidi algorithm according to Unicode. * This is a 0 value. 
* @stable ICU 3.6 */ UBIDI_REORDER_DEFAULT = 0, /** Logical to Visual algorithm which handles numbers in a way which * mimics the behavior of Windows XP. * @stable ICU 3.6 */ UBIDI_REORDER_NUMBERS_SPECIAL, /** Logical to Visual algorithm grouping numbers with adjacent R characters * (reversible algorithm). * @stable ICU 3.6 */ UBIDI_REORDER_GROUP_NUMBERS_WITH_R, /** Reorder runs only to transform a Logical LTR string to the Logical RTL * string with the same display, or vice-versa.
* If this mode is set together with option * #UBIDI_OPTION_INSERT_MARKS, some Bidi controls in the source * text may be removed and other controls may be added to produce the * minimum combination which has the required display. * @stable ICU 3.6 */ UBIDI_REORDER_RUNS_ONLY, /** Visual to Logical algorithm which handles numbers like L * (same algorithm as selected by ubidi_setInverse(true)). * @see ubidi_setInverse * @stable ICU 3.6 */ UBIDI_REORDER_INVERSE_NUMBERS_AS_L, /** Visual to Logical algorithm equivalent to the regular Logical to Visual * algorithm. * @stable ICU 3.6 */ UBIDI_REORDER_INVERSE_LIKE_DIRECT, /** Inverse Bidi (Visual to Logical) algorithm for the * UBIDI_REORDER_NUMBERS_SPECIAL Bidi algorithm. * @stable ICU 3.6 */ UBIDI_REORDER_INVERSE_FOR_NUMBERS_SPECIAL, } UBiDiReorderingMode; /** * Modify the operation of the Bidi algorithm such that it implements some * variant to the basic Bidi algorithm or approximates an "inverse Bidi" * algorithm, depending on different values of the "reordering mode". * This function must be called before ubidi_setPara(), and stays * in effect until called again with a different argument. * *

The normal operation of the Bidi algorithm as described * in the Unicode Standard Annex #9 is to take text stored in logical * (keyboard, typing) order and to determine how to reorder it for visual * rendering.

* *

With the reordering mode set to a value other than * #UBIDI_REORDER_DEFAULT, this function changes the behavior of * some of the subsequent functions in a way such that they implement an * inverse Bidi algorithm or some other algorithm variants.

* *

Some legacy systems store text in visual order, and for operations * with standard, Unicode-based algorithms, the text needs to be transformed * into logical order. This is effectively the inverse algorithm of the * described Bidi algorithm. Note that there is no standard algorithm for * this "inverse Bidi", so a number of variants are implemented here.

* *

In other cases, it may be desirable to emulate some variant of the * Logical to Visual algorithm (e.g. one used in MS Windows), or perform a * Logical to Logical transformation.

* *
    *
  • When the reordering mode is set to #UBIDI_REORDER_DEFAULT, * the standard Bidi Logical to Visual algorithm is applied.
  • * *
  • When the reordering mode is set to * #UBIDI_REORDER_NUMBERS_SPECIAL, * the algorithm used to perform Bidi transformations when calling * ubidi_setPara should approximate the algorithm used in * Microsoft Windows XP rather than strictly conform to the Unicode Bidi * algorithm. *
    * The differences between the basic algorithm and the algorithm addressed * by this option are as follows: *
      *
    • Within text at an even embedding level, the sequence "123AB" * (where AB represent R or AL letters) is transformed to "123BA" by the * Unicode algorithm and to "BA123" by the Windows algorithm.
    • *
    • Arabic-Indic numbers (AN) are handled by the Windows algorithm just * like regular numbers (EN).
    • *
  • * *
  • When the reordering mode is set to * #UBIDI_REORDER_GROUP_NUMBERS_WITH_R, * numbers located between LTR text and RTL text are associated with the RTL * text. For instance, an LTR paragraph with content "abc 123 DEF" (where * upper case letters represent RTL characters) will be transformed to * "abc FED 123" (and not "abc 123 FED"), "DEF 123 abc" will be transformed * to "123 FED abc" and "123 FED abc" will be transformed to "DEF 123 abc". * This makes the algorithm reversible and makes it useful when round trip * (from visual to logical and back to visual) must be achieved without * adding LRM characters. However, this is a variation from the standard * Unicode Bidi algorithm.
    * The source text should not contain Bidi control characters other than LRM * or RLM.
  • * *
  • When the reordering mode is set to * #UBIDI_REORDER_RUNS_ONLY, * a "Logical to Logical" transformation must be performed: *
      *
    • If the default text level of the source text (argument paraLevel * in ubidi_setPara) is even, the source text will be handled as * LTR logical text and will be transformed to the RTL logical text which has * the same LTR visual display.
    • *
    • If the default level of the source text is odd, the source text * will be handled as RTL logical text and will be transformed to the * LTR logical text which has the same LTR visual display.
    • *
    * This mode may be needed when logical text which is basically Arabic or * Hebrew, with possible included numbers or phrases in English, has to be * displayed as if it had an even embedding level (this can happen if the * displaying application treats all text as if it was basically LTR). *
    * This mode may also be needed in the reverse case, when logical text which is * basically English, with possible included phrases in Arabic or Hebrew, has to * be displayed as if it had an odd embedding level. *
    * Both cases could be handled by adding LRE or RLE at the head of the text, * if the display subsystem supports these formatting controls. If it does not, * the problem may be handled by transforming the source text in this mode * before displaying it, so that it will be displayed properly.
    * The source text should not contain Bidi control characters other than LRM * or RLM.
  • * *
  • When the reordering mode is set to * #UBIDI_REORDER_INVERSE_NUMBERS_AS_L, an "inverse Bidi" algorithm * is applied. * Runs of text with numeric characters will be treated like LTR letters and * may need to be surrounded with LRM characters when they are written in * reordered sequence (the option #UBIDI_INSERT_LRM_FOR_NUMERIC can * be used with function ubidi_writeReordered to this end). This * mode is equivalent to calling ubidi_setInverse() with * argument isInverse set to true.
  • * *
  • When the reordering mode is set to * #UBIDI_REORDER_INVERSE_LIKE_DIRECT, the "direct" Logical to Visual * Bidi algorithm is used as an approximation of an "inverse Bidi" algorithm. * This mode is similar to mode #UBIDI_REORDER_INVERSE_NUMBERS_AS_L * but is closer to the regular Bidi algorithm. *
    * For example, an LTR paragraph with the content "FED 123 456 CBA" (where * upper case represents RTL characters) will be transformed to * "ABC 456 123 DEF", as opposed to "DEF 123 456 ABC" * with mode UBIDI_REORDER_INVERSE_NUMBERS_AS_L.
    * When used in conjunction with option * #UBIDI_OPTION_INSERT_MARKS, this mode generally * adds Bidi marks to the output significantly more sparingly than mode * #UBIDI_REORDER_INVERSE_NUMBERS_AS_L with option * #UBIDI_INSERT_LRM_FOR_NUMERIC in calls to * ubidi_writeReordered.
  • * *
  • When the reordering mode is set to * #UBIDI_REORDER_INVERSE_FOR_NUMBERS_SPECIAL, the Logical to Visual * Bidi algorithm used in Windows XP is used as an approximation of an "inverse Bidi" algorithm. *
    * For example, an LTR paragraph with the content "abc FED123" (where * upper case represents RTL characters) will be transformed to "abc 123DEF".
  • *
* *

In all the reordering modes specifying an "inverse Bidi" algorithm * (i.e. those with a name starting with UBIDI_REORDER_INVERSE), * output runs should be retrieved using * ubidi_getVisualRun(), and the output text with * ubidi_writeReordered(). The caller should keep in mind that in * "inverse Bidi" modes the input is actually visually ordered text and * reordered output returned by ubidi_getVisualRun() or * ubidi_writeReordered() are actually runs or character string * of logically ordered output.
* For all the "inverse Bidi" modes, the source text should not contain * Bidi control characters other than LRM or RLM.

* *

Note that option #UBIDI_OUTPUT_REVERSE of * ubidi_writeReordered has no useful meaning and should not be * used in conjunction with any value of the reordering mode specifying * "inverse Bidi" or with value UBIDI_REORDER_RUNS_ONLY. * * @param pBiDi is a UBiDi object. * @param reorderingMode specifies the required variant of the Bidi algorithm. * * @see UBiDiReorderingMode * @see ubidi_setInverse * @see ubidi_setPara * @see ubidi_writeReordered * @stable ICU 3.6 */ U_CAPI void U_EXPORT2 ubidi_setReorderingMode(UBiDi *pBiDi, UBiDiReorderingMode reorderingMode); /** * What is the requested reordering mode for a given Bidi object? * * @param pBiDi is a UBiDi object. * @return the current reordering mode of the Bidi object * @see ubidi_setReorderingMode * @stable ICU 3.6 */ U_CAPI UBiDiReorderingMode U_EXPORT2 ubidi_getReorderingMode(UBiDi *pBiDi); /** * UBiDiReorderingOption values indicate which options are * specified to affect the Bidi algorithm. * * @see ubidi_setReorderingOptions * @stable ICU 3.6 */ typedef enum UBiDiReorderingOption { /** * option value for ubidi_setReorderingOptions: * disable all the options which can be set with this function * @see ubidi_setReorderingOptions * @stable ICU 3.6 */ UBIDI_OPTION_DEFAULT = 0, /** * option bit for ubidi_setReorderingOptions: * insert Bidi marks (LRM or RLM) when needed to ensure correct result of * a reordering to a Logical order * *

This option must be set or reset before calling * ubidi_setPara.

* *

This option is significant only with reordering modes which generate * a result with Logical order, specifically:

*
    *
  • #UBIDI_REORDER_RUNS_ONLY
  • *
  • #UBIDI_REORDER_INVERSE_NUMBERS_AS_L
  • *
  • #UBIDI_REORDER_INVERSE_LIKE_DIRECT
  • *
  • #UBIDI_REORDER_INVERSE_FOR_NUMBERS_SPECIAL
  • *
* *

If this option is set in conjunction with reordering mode * #UBIDI_REORDER_INVERSE_NUMBERS_AS_L or with calling * ubidi_setInverse(true), it implies * option #UBIDI_INSERT_LRM_FOR_NUMERIC * in calls to function ubidi_writeReordered().

* *

For other reordering modes, a minimum number of LRM or RLM characters * will be added to the source text after reordering it so as to ensure * round trip, i.e. when applying the inverse reordering mode on the * resulting logical text with removal of Bidi marks * (option #UBIDI_OPTION_REMOVE_CONTROLS set before calling * ubidi_setPara() or option #UBIDI_REMOVE_BIDI_CONTROLS * in ubidi_writeReordered), the result will be identical to the * source text in the first transformation. * *

This option will be ignored if specified together with option * #UBIDI_OPTION_REMOVE_CONTROLS. It inhibits option * UBIDI_REMOVE_BIDI_CONTROLS in calls to function * ubidi_writeReordered() and it implies option * #UBIDI_INSERT_LRM_FOR_NUMERIC in calls to function * ubidi_writeReordered() if the reordering mode is * #UBIDI_REORDER_INVERSE_NUMBERS_AS_L.

* * @see ubidi_setReorderingMode * @see ubidi_setReorderingOptions * @stable ICU 3.6 */ UBIDI_OPTION_INSERT_MARKS = 1, /** * option bit for ubidi_setReorderingOptions: * remove Bidi control characters * *

This option must be set or reset before calling * ubidi_setPara.

* *

This option nullifies option #UBIDI_OPTION_INSERT_MARKS. * It inhibits option #UBIDI_INSERT_LRM_FOR_NUMERIC in calls * to function ubidi_writeReordered() and it implies option * #UBIDI_REMOVE_BIDI_CONTROLS in calls to that function.

* * @see ubidi_setReorderingMode * @see ubidi_setReorderingOptions * @stable ICU 3.6 */ UBIDI_OPTION_REMOVE_CONTROLS = 2, /** * option bit for ubidi_setReorderingOptions: * process the output as part of a stream to be continued * *

This option must be set or reset before calling * ubidi_setPara.

* *

This option specifies that the caller is interested in processing a large * text object in parts. * The results of the successive calls are expected to be concatenated by the * caller. Only the call for the last part will have this option bit off.

* *

When this option bit is on, ubidi_setPara() may process * less than the full source text in order to truncate the text at a meaningful * boundary. The caller should call ubidi_getProcessedLength() * immediately after calling ubidi_setPara() in order to * determine how much of the source text has been processed. * Source text beyond that length should be resubmitted in following calls to * ubidi_setPara. The processed length may be less than * the length of the source text if a character preceding the last character of * the source text constitutes a reasonable boundary (like a block separator) * for text to be continued.
* If the last character of the source text constitutes a reasonable * boundary, the whole text will be processed at once.
* If nowhere in the source text there exists * such a reasonable boundary, the processed length will be zero.
* The caller should check for such an occurrence and do one of the following: *

  • submit a larger amount of text with a better chance to include * a reasonable boundary.
  • *
  • resubmit the same text after turning off option * UBIDI_OPTION_STREAMING.
* In all cases, this option should be turned off before processing the last * part of the text.

* *

When the UBIDI_OPTION_STREAMING option is used, * it is recommended to call ubidi_orderParagraphsLTR() with * argument orderParagraphsLTR set to true before * calling ubidi_setPara so that later paragraphs may be * concatenated to previous paragraphs on the right.

* * @see ubidi_setReorderingMode * @see ubidi_setReorderingOptions * @see ubidi_getProcessedLength * @see ubidi_orderParagraphsLTR * @stable ICU 3.6 */ UBIDI_OPTION_STREAMING = 4 } UBiDiReorderingOption; /** * Specify which of the reordering options * should be applied during Bidi transformations. * Each option must be set or reset before calling ubidi_setPara(). * * @param pBiDi is a UBiDi object. * @param reorderingOptions is a bitwise OR of zero or more of the following * UBiDiReorderingOption values: * #UBIDI_OPTION_DEFAULT, #UBIDI_OPTION_INSERT_MARKS, * #UBIDI_OPTION_REMOVE_CONTROLS, #UBIDI_OPTION_STREAMING. * Passing #UBIDI_OPTION_DEFAULT (0) alone disables all of the options. * * @see UBiDiReorderingOption * @see ubidi_getReorderingOptions * @stable ICU 3.6 */ U_CAPI void U_EXPORT2 ubidi_setReorderingOptions(UBiDi *pBiDi, uint32_t reorderingOptions); /** * What are the reordering options applied to a given Bidi object? * * @param pBiDi is a UBiDi object. * @return the current reordering options of the Bidi object * @see ubidi_setReorderingOptions * @stable ICU 3.6 */ U_CAPI uint32_t U_EXPORT2 ubidi_getReorderingOptions(UBiDi *pBiDi); /** * Set the context before a call to ubidi_setPara().

* * ubidi_setPara() computes the left-right directionality for a given piece * of text which is supplied as one of its arguments. Sometimes this piece * of text (the "main text") should be considered in context, because text * appearing before ("prologue") and/or after ("epilogue") the main text * may affect the result of this computation.

* * This function specifies the prologue and/or the epilogue for the next * call to ubidi_setPara(). The characters specified as prologue and * epilogue should not be modified by the calling program until the call * to ubidi_setPara() has returned. If successive calls to ubidi_setPara() * all need specification of a context, ubidi_setContext() must be called * before each call to ubidi_setPara(). In other words, a context is not * "remembered" after the following successful call to ubidi_setPara().

* * If a call to ubidi_setPara() specifies UBIDI_DEFAULT_LTR or * UBIDI_DEFAULT_RTL as paraLevel and is preceded by a call to * ubidi_setContext() which specifies a prologue, the paragraph level will * be computed taking in consideration the text in the prologue.

* * When ubidi_setPara() is called without a previous call to * ubidi_setContext, the main text is handled as if preceded and followed * by strong directional characters at the current paragraph level. * Calling ubidi_setContext() with specification of a prologue will change * this behavior by handling the main text as if preceded by the last * strong character appearing in the prologue, if any. * Calling ubidi_setContext() with specification of an epilogue will change * the behavior of ubidi_setPara() by handling the main text as if followed * by the first strong character or digit appearing in the epilogue, if any.

* * Note 1: if ubidi_setContext is called repeatedly without * calling ubidi_setPara, the earlier calls have no effect, * only the last call will be remembered for the next call to * ubidi_setPara.

* * Note 2: calling ubidi_setContext(pBiDi, NULL, 0, NULL, 0, &errorCode) * cancels any previous setting of non-empty prologue or epilogue. * The next call to ubidi_setPara() will process no * prologue or epilogue.

* * Note 3: users must be aware that even after setting the context * before a call to ubidi_setPara() to perform e.g. a logical to visual * transformation, the resulting string may not be identical to what it * would have been if all the text, including prologue and epilogue, had * been processed together.
* Example (upper case letters represent RTL characters):
*   prologue = "abc DE"
*   epilogue = none
*   main text = "FGH xyz"
*   paraLevel = UBIDI_LTR
*   display without prologue = "HGF xyz" * ("HGF" is adjacent to "xyz")
*   display with prologue = "abc HGFED xyz" * ("HGF" is not adjacent to "xyz")
* * @param pBiDi is a paragraph UBiDi object. * * @param prologue is a pointer to the text which precedes the text that * will be specified in a coming call to ubidi_setPara(). * If there is no prologue to consider, then proLength * must be zero and this pointer can be NULL. * * @param proLength is the length of the prologue; if proLength==-1 * then the prologue must be zero-terminated. * Otherwise proLength must be >= 0. If proLength==0, it means * that there is no prologue to consider. * * @param epilogue is a pointer to the text which follows the text that * will be specified in a coming call to ubidi_setPara(). * If there is no epilogue to consider, then epiLength * must be zero and this pointer can be NULL. * * @param epiLength is the length of the epilogue; if epiLength==-1 * then the epilogue must be zero-terminated. * Otherwise epiLength must be >= 0. If epiLength==0, it means * that there is no epilogue to consider. * * @param pErrorCode must be a valid pointer to an error code value. * * @see ubidi_setPara * @stable ICU 4.8 */ U_CAPI void U_EXPORT2 ubidi_setContext(UBiDi *pBiDi, const UChar *prologue, int32_t proLength, const UChar *epilogue, int32_t epiLength, UErrorCode *pErrorCode); /** * Perform the Unicode Bidi algorithm. It is defined in the * Unicode Standard Annex #9, * version 13, * also described in The Unicode Standard, Version 4.0 .

* * This function takes a piece of plain text containing one or more paragraphs, * with or without externally specified embedding levels from styled * text and computes the left-right-directionality of each character.

* * If the entire text is all of the same directionality, then * the function may not perform all the steps described by the algorithm, * i.e., some levels may not be the same as if all steps were performed. * This is not relevant for unidirectional text.
* For example, in pure LTR text with numbers the numbers would get * a resolved level of 2 higher than the surrounding text according to * the algorithm. This implementation may set all resolved levels to * the same value in such a case.

* * The text can be composed of multiple paragraphs. Occurrence of a block * separator in the text terminates a paragraph, and whatever comes next starts * a new paragraph. The exception to this rule is when a Carriage Return (CR) * is followed by a Line Feed (LF). Both CR and LF are block separators, but * in that case, the pair of characters is considered as terminating the * preceding paragraph, and a new paragraph will be started by a character * coming after the LF. * * @param pBiDi A UBiDi object allocated with ubidi_open() * which will be set to contain the reordering information, * especially the resolved levels for all the characters in text. * * @param text is a pointer to the text that the Bidi algorithm will be performed on. * This pointer is stored in the UBiDi object and can be retrieved * with ubidi_getText().
* Note: the text must be (at least) length long. * * @param length is the length of the text; if length==-1 then * the text must be zero-terminated. * * @param paraLevel specifies the default level for the text; * it is typically 0 (LTR) or 1 (RTL). * If the function shall determine the paragraph level from the text, * then paraLevel can be set to * either #UBIDI_DEFAULT_LTR * or #UBIDI_DEFAULT_RTL; if the text contains multiple * paragraphs, the paragraph level shall be determined separately for * each paragraph; if a paragraph does not include any strongly typed * character, then the desired default is used (0 for LTR or 1 for RTL). * Any other value between 0 and #UBIDI_MAX_EXPLICIT_LEVEL * is also valid, with odd levels indicating RTL. * * @param embeddingLevels (in) may be used to preset the embedding and override levels, * ignoring characters like LRE and PDF in the text. * A level overrides the directional property of its corresponding * (same index) character if the level has the * #UBIDI_LEVEL_OVERRIDE bit set.

* Aside from that bit, it must be * paraLevel<=embeddingLevels[]<=UBIDI_MAX_EXPLICIT_LEVEL, * except that level 0 is always allowed. * Level 0 for a paragraph separator prevents reordering of paragraphs; * this only works reliably if #UBIDI_LEVEL_OVERRIDE * is also set for paragraph separators. * Level 0 for other characters is treated as a wildcard * and is lifted up to the resolved level of the surrounding paragraph.

* Caution: A copy of this pointer, not of the levels, * will be stored in the UBiDi object; * the embeddingLevels array must not be * deallocated before the UBiDi structure is destroyed or reused, * and the embeddingLevels * should not be modified to avoid unexpected results on subsequent Bidi operations. * However, the ubidi_setPara() and * ubidi_setLine() functions may modify some or all of the levels.

* After the UBiDi object is reused or destroyed, the caller * must take care of the deallocation of the embeddingLevels array.

* Note: the embeddingLevels array must be * at least length long. * This pointer can be NULL if this * value is not necessary. * * @param pErrorCode must be a valid pointer to an error code value. * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ubidi_setPara(UBiDi *pBiDi, const UChar *text, int32_t length, UBiDiLevel paraLevel, UBiDiLevel *embeddingLevels, UErrorCode *pErrorCode); /** * ubidi_setLine() sets a UBiDi to * contain the reordering information, especially the resolved levels, * for all the characters in a line of text. This line of text is * specified by referring to a UBiDi object representing * this information for a piece of text containing one or more paragraphs, * and by specifying a range of indexes in this text.

* In the new line object, the indexes will range from 0 to limit-start-1.

* * This is used after calling ubidi_setPara() * for a piece of text, and after line-breaking on that text. * It is not necessary if each paragraph is treated as a single line.

* * After line-breaking, rules (L1) and (L2) for the treatment of * trailing WS and for reordering are performed on * a UBiDi object that represents a line.

* * Important: pLineBiDi shares data with * pParaBiDi. * You must destroy or reuse pLineBiDi before pParaBiDi. * In other words, you must destroy or reuse the UBiDi object for a line * before the object for its parent paragraph.

* * The text pointer that was stored in pParaBiDi is also copied, * and start is added to it so that it points to the beginning of the * line for this object. * * @param pParaBiDi is the parent paragraph object. It must have been set * by a successful call to ubidi_setPara. * * @param start is the line's first index into the text. * * @param limit is just behind the line's last index into the text * (its last index +1).
* It must be 0<=start<limit<=containing paragraph limit. * If the specified line crosses a paragraph boundary, the function * will terminate with error code U_ILLEGAL_ARGUMENT_ERROR. * * @param pLineBiDi is the object that will now represent a line of the text. * * @param pErrorCode must be a valid pointer to an error code value. * * @see ubidi_setPara * @see ubidi_getProcessedLength * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ubidi_setLine(const UBiDi *pParaBiDi, int32_t start, int32_t limit, UBiDi *pLineBiDi, UErrorCode *pErrorCode); /** * Get the directionality of the text. * * @param pBiDi is the paragraph or line UBiDi object. * * @return a value of UBIDI_LTR, UBIDI_RTL * or UBIDI_MIXED * that indicates if the entire text * represented by this object is unidirectional, * and which direction, or if it is mixed-directional. * Note - The value UBIDI_NEUTRAL is never returned from this method. * * @see UBiDiDirection * @stable ICU 2.0 */ U_CAPI UBiDiDirection U_EXPORT2 ubidi_getDirection(const UBiDi *pBiDi); /** * Gets the base direction of the text provided according * to the Unicode Bidirectional Algorithm. The base direction * is derived from the first character in the string with bidirectional * character type L, R, or AL. If the first such character has type L, * UBIDI_LTR is returned. If the first such character has * type R or AL, UBIDI_RTL is returned. If the string does * not contain any character of these types, then * UBIDI_NEUTRAL is returned. * * This is a lightweight function for use when only the base direction * is needed and no further bidi processing of the text is needed. * * @param text is a pointer to the text whose base * direction is needed. * Note: the text must be (at least) @c length long. * * @param length is the length of the text; * if length==-1 then the text * must be zero-terminated. 
* * @return UBIDI_LTR, UBIDI_RTL, * UBIDI_NEUTRAL * * @see UBiDiDirection * @stable ICU 4.6 */ U_CAPI UBiDiDirection U_EXPORT2 ubidi_getBaseDirection(const UChar *text, int32_t length ); /** * Get the pointer to the text. * * @param pBiDi is the paragraph or line UBiDi object. * * @return The pointer to the text that the UBiDi object was created for. * * @see ubidi_setPara * @see ubidi_setLine * @stable ICU 2.0 */ U_CAPI const UChar * U_EXPORT2 ubidi_getText(const UBiDi *pBiDi); /** * Get the length of the text. * * @param pBiDi is the paragraph or line UBiDi object. * * @return The length of the text that the UBiDi object was created for. * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 ubidi_getLength(const UBiDi *pBiDi); /** * Get the paragraph level of the text. * * @param pBiDi is the paragraph or line UBiDi object. * * @return The paragraph level. If there are multiple paragraphs, their * level may vary if the required paraLevel is UBIDI_DEFAULT_LTR or * UBIDI_DEFAULT_RTL. In that case, the level of the first paragraph * is returned. * * @see UBiDiLevel * @see ubidi_getParagraph * @see ubidi_getParagraphByIndex * @stable ICU 2.0 */ U_CAPI UBiDiLevel U_EXPORT2 ubidi_getParaLevel(const UBiDi *pBiDi); /** * Get the number of paragraphs. * * @param pBiDi is the paragraph or line UBiDi object. * * @return The number of paragraphs. * @stable ICU 3.4 */ U_CAPI int32_t U_EXPORT2 ubidi_countParagraphs(UBiDi *pBiDi); /** * Get a paragraph, given a position within the text. * This function returns information about a paragraph.
* Note: if the paragraph index is known, it is more efficient to * retrieve the paragraph information using ubidi_getParagraphByIndex().

* * @param pBiDi is the paragraph or line UBiDi object. * * @param charIndex is the index of a character within the text, in the * range [0..ubidi_getProcessedLength(pBiDi)-1]. * * @param pParaStart will receive the index of the first character of the * paragraph in the text. * This pointer can be NULL if this * value is not necessary. * * @param pParaLimit will receive the limit of the paragraph. * The l-value that you point to here may be the * same expression (variable) as the one for * charIndex. * This pointer can be NULL if this * value is not necessary. * * @param pParaLevel will receive the level of the paragraph. * This pointer can be NULL if this * value is not necessary. * * @param pErrorCode must be a valid pointer to an error code value. * * @return The index of the paragraph containing the specified position. * * @see ubidi_getProcessedLength * @stable ICU 3.4 */ U_CAPI int32_t U_EXPORT2 ubidi_getParagraph(const UBiDi *pBiDi, int32_t charIndex, int32_t *pParaStart, int32_t *pParaLimit, UBiDiLevel *pParaLevel, UErrorCode *pErrorCode); /** * Get a paragraph, given the index of this paragraph. * * This function returns information about a paragraph.

* * @param pBiDi is the paragraph UBiDi object. * * @param paraIndex is the number of the paragraph, in the * range [0..ubidi_countParagraphs(pBiDi)-1]. * * @param pParaStart will receive the index of the first character of the * paragraph in the text. * This pointer can be NULL if this * value is not necessary. * * @param pParaLimit will receive the limit of the paragraph. * This pointer can be NULL if this * value is not necessary. * * @param pParaLevel will receive the level of the paragraph. * This pointer can be NULL if this * value is not necessary. * * @param pErrorCode must be a valid pointer to an error code value. * * @stable ICU 3.4 */ U_CAPI void U_EXPORT2 ubidi_getParagraphByIndex(const UBiDi *pBiDi, int32_t paraIndex, int32_t *pParaStart, int32_t *pParaLimit, UBiDiLevel *pParaLevel, UErrorCode *pErrorCode); /** * Get the level for one character. * * @param pBiDi is the paragraph or line UBiDi object. * * @param charIndex the index of a character. It must be in the range * [0..ubidi_getProcessedLength(pBiDi)-1]. * * @return The level for the character at charIndex (0 if charIndex is not * in the valid range). * * @see UBiDiLevel * @see ubidi_getProcessedLength * @stable ICU 2.0 */ U_CAPI UBiDiLevel U_EXPORT2 ubidi_getLevelAt(const UBiDi *pBiDi, int32_t charIndex); /** * Get an array of levels for each character.

* * Note that this function may allocate memory under some * circumstances, unlike ubidi_getLevelAt(). * * @param pBiDi is the paragraph or line UBiDi object, whose * text length must be strictly positive. * * @param pErrorCode must be a valid pointer to an error code value. * * @return The levels array for the text, * or NULL if an error occurs. * * @see UBiDiLevel * @see ubidi_getProcessedLength * @stable ICU 2.0 */ U_CAPI const UBiDiLevel * U_EXPORT2 ubidi_getLevels(UBiDi *pBiDi, UErrorCode *pErrorCode); /** * Get a logical run. * This function returns information about a run and is used * to retrieve runs in logical order.

* This is especially useful for line-breaking on a paragraph. * * @param pBiDi is the paragraph or line UBiDi object. * * @param logicalPosition is a logical position within the source text. * * @param pLogicalLimit will receive the limit of the corresponding run. * The l-value that you point to here may be the * same expression (variable) as the one for * logicalPosition. * This pointer can be NULL if this * value is not necessary. * * @param pLevel will receive the level of the corresponding run. * This pointer can be NULL if this * value is not necessary. * * @see ubidi_getProcessedLength * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ubidi_getLogicalRun(const UBiDi *pBiDi, int32_t logicalPosition, int32_t *pLogicalLimit, UBiDiLevel *pLevel); /** * Get the number of runs. * This function may invoke the actual reordering on the * UBiDi object, after ubidi_setPara() * may have resolved only the levels of the text. Therefore, * ubidi_countRuns() may have to allocate memory, * and may fail doing so. * * @param pBiDi is the paragraph or line UBiDi object. * * @param pErrorCode must be a valid pointer to an error code value. * * @return The number of runs. * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 ubidi_countRuns(UBiDi *pBiDi, UErrorCode *pErrorCode); /** * Get one run's logical start, length, and directionality, * which can be 0 for LTR or 1 for RTL. * In an RTL run, the character at the logical start is * visually on the right of the displayed run. * The length is the number of characters in the run.

* ubidi_countRuns() should be called * before the runs are retrieved. * * @param pBiDi is the paragraph or line UBiDi object. * * @param runIndex is the number of the run in visual order, in the * range [0..ubidi_countRuns(pBiDi)-1]. * * @param pLogicalStart is the first logical character index in the text. * The pointer may be NULL if this index is not needed. * * @param pLength is the number of characters (at least one) in the run. * The pointer may be NULL if this is not needed. * * @return the directionality of the run, * UBIDI_LTR==0 or UBIDI_RTL==1, * never UBIDI_MIXED, * never UBIDI_NEUTRAL. * * @see ubidi_countRuns * * Example: *

 * \code
 * int32_t i, count=ubidi_countRuns(pBiDi),
 *         logicalStart, visualIndex=0, length;
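 * for(i=0; i<count; ++i) {
 *     if(UBIDI_LTR==ubidi_getVisualRun(pBiDi, i, &logicalStart, &length)) {
 *         do { // LTR
 *             show_char(text[logicalStart++], visualIndex++);
 *         } while(--length>0);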
 *     } else {
 *         logicalStart+=length;  // logicalLimit
 *         do { // RTL
 *             show_char(text[--logicalStart], visualIndex++);
 *         } while(--length>0);
 *     }
 * }
 *\endcode
 * 
* * Note that in right-to-left runs, code like this places * second surrogates before first ones (which is generally a bad idea) * and combining characters before base characters. *

* Use of ubidi_writeReordered(), optionally with the * #UBIDI_KEEP_BASE_COMBINING option, can be considered in order * to avoid these issues. * @stable ICU 2.0 */ U_CAPI UBiDiDirection U_EXPORT2 ubidi_getVisualRun(UBiDi *pBiDi, int32_t runIndex, int32_t *pLogicalStart, int32_t *pLength); /** * Get the visual position from a logical text position. * If such a mapping is used many times on the same * UBiDi object, then calling * ubidi_getLogicalMap() is more efficient.

* * The value returned may be #UBIDI_MAP_NOWHERE if there is no * visual position because the corresponding text character is a Bidi control * removed from output by the option #UBIDI_OPTION_REMOVE_CONTROLS. *

* When the visual output is altered by using options of * ubidi_writeReordered() such as UBIDI_INSERT_LRM_FOR_NUMERIC, * UBIDI_KEEP_BASE_COMBINING, UBIDI_OUTPUT_REVERSE, * UBIDI_REMOVE_BIDI_CONTROLS, the visual position returned may not * be correct. It is advised to use, when possible, reordering options * such as UBIDI_OPTION_INSERT_MARKS and UBIDI_OPTION_REMOVE_CONTROLS. *

* Note that in right-to-left runs, this mapping places * second surrogates before first ones (which is generally a bad idea) * and combining characters before base characters. * Use of ubidi_writeReordered(), optionally with the * #UBIDI_KEEP_BASE_COMBINING option can be considered instead * of using the mapping, in order to avoid these issues. * * @param pBiDi is the paragraph or line UBiDi object. * * @param logicalIndex is the index of a character in the text. * * @param pErrorCode must be a valid pointer to an error code value. * * @return The visual position of this character. * * @see ubidi_getLogicalMap * @see ubidi_getLogicalIndex * @see ubidi_getProcessedLength * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 ubidi_getVisualIndex(UBiDi *pBiDi, int32_t logicalIndex, UErrorCode *pErrorCode); /** * Get the logical text position from a visual position. * If such a mapping is used many times on the same * UBiDi object, then calling * ubidi_getVisualMap() is more efficient.

* * The value returned may be #UBIDI_MAP_NOWHERE if there is no * logical position because the corresponding text character is a Bidi mark * inserted in the output by option #UBIDI_OPTION_INSERT_MARKS. *

* This is the inverse function to ubidi_getVisualIndex(). *

* When the visual output is altered by using options of * ubidi_writeReordered() such as UBIDI_INSERT_LRM_FOR_NUMERIC, * UBIDI_KEEP_BASE_COMBINING, UBIDI_OUTPUT_REVERSE, * UBIDI_REMOVE_BIDI_CONTROLS, the logical position returned may not * be correct. It is advised to use, when possible, reordering options * such as UBIDI_OPTION_INSERT_MARKS and UBIDI_OPTION_REMOVE_CONTROLS. * * @param pBiDi is the paragraph or line UBiDi object. * * @param visualIndex is the visual position of a character. * * @param pErrorCode must be a valid pointer to an error code value. * * @return The index of this character in the text. * * @see ubidi_getVisualMap * @see ubidi_getVisualIndex * @see ubidi_getResultLength * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 ubidi_getLogicalIndex(UBiDi *pBiDi, int32_t visualIndex, UErrorCode *pErrorCode); /** * Get a logical-to-visual index map (array) for the characters in the UBiDi * (paragraph or line) object. *

* Some values in the map may be #UBIDI_MAP_NOWHERE if the * corresponding text characters are Bidi controls removed from the visual * output by the option #UBIDI_OPTION_REMOVE_CONTROLS. *

* When the visual output is altered by using options of * ubidi_writeReordered() such as UBIDI_INSERT_LRM_FOR_NUMERIC, * UBIDI_KEEP_BASE_COMBINING, UBIDI_OUTPUT_REVERSE, * UBIDI_REMOVE_BIDI_CONTROLS, the visual positions returned may not * be correct. It is advised to use, when possible, reordering options * such as UBIDI_OPTION_INSERT_MARKS and UBIDI_OPTION_REMOVE_CONTROLS. *

* Note that in right-to-left runs, this mapping places * second surrogates before first ones (which is generally a bad idea) * and combining characters before base characters. * Use of ubidi_writeReordered(), optionally with the * #UBIDI_KEEP_BASE_COMBINING option can be considered instead * of using the mapping, in order to avoid these issues. * * @param pBiDi is the paragraph or line UBiDi object. * * @param indexMap is a pointer to an array of ubidi_getProcessedLength() * indexes which will reflect the reordering of the characters. * If option #UBIDI_OPTION_INSERT_MARKS is set, the number * of elements allocated in indexMap must be no less than * ubidi_getResultLength(). * The array does not need to be initialized.

* The index map will result in indexMap[logicalIndex]==visualIndex. * * @param pErrorCode must be a valid pointer to an error code value. * * @see ubidi_getVisualMap * @see ubidi_getVisualIndex * @see ubidi_getProcessedLength * @see ubidi_getResultLength * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ubidi_getLogicalMap(UBiDi *pBiDi, int32_t *indexMap, UErrorCode *pErrorCode); /** * Get a visual-to-logical index map (array) for the characters in the UBiDi * (paragraph or line) object. *

* Some values in the map may be #UBIDI_MAP_NOWHERE if the * corresponding text characters are Bidi marks inserted in the visual output * by the option #UBIDI_OPTION_INSERT_MARKS. *

* When the visual output is altered by using options of * ubidi_writeReordered() such as UBIDI_INSERT_LRM_FOR_NUMERIC, * UBIDI_KEEP_BASE_COMBINING, UBIDI_OUTPUT_REVERSE, * UBIDI_REMOVE_BIDI_CONTROLS, the logical positions returned may not * be correct. It is advised to use, when possible, reordering options * such as UBIDI_OPTION_INSERT_MARKS and UBIDI_OPTION_REMOVE_CONTROLS. * * @param pBiDi is the paragraph or line UBiDi object. * * @param indexMap is a pointer to an array of ubidi_getResultLength() * indexes which will reflect the reordering of the characters. * If option #UBIDI_OPTION_REMOVE_CONTROLS is set, the number * of elements allocated in indexMap must be no less than * ubidi_getProcessedLength(). * The array does not need to be initialized.

* The index map will result in indexMap[visualIndex]==logicalIndex. * * @param pErrorCode must be a valid pointer to an error code value. * * @see ubidi_getLogicalMap * @see ubidi_getLogicalIndex * @see ubidi_getProcessedLength * @see ubidi_getResultLength * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ubidi_getVisualMap(UBiDi *pBiDi, int32_t *indexMap, UErrorCode *pErrorCode); /** * This is a convenience function that does not use a UBiDi object. * It is intended to be used for when an application has determined the levels * of objects (character sequences) and just needs to have them reordered (L2). * This is equivalent to using ubidi_getLogicalMap() on a * UBiDi object. * * @param levels is an array with length levels that have been determined by * the application. * * @param length is the number of levels in the array, or, semantically, * the number of objects to be reordered. * It must be length>0. * * @param indexMap is a pointer to an array of length * indexes which will reflect the reordering of the characters. * The array does not need to be initialized.

* The index map will result in indexMap[logicalIndex]==visualIndex. * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ubidi_reorderLogical(const UBiDiLevel *levels, int32_t length, int32_t *indexMap); /** * This is a convenience function that does not use a UBiDi object. * It is intended to be used for when an application has determined the levels * of objects (character sequences) and just needs to have them reordered (L2). * This is equivalent to using ubidi_getVisualMap() on a * UBiDi object. * * @param levels is an array with length levels that have been determined by * the application. * * @param length is the number of levels in the array, or, semantically, * the number of objects to be reordered. * It must be length>0. * * @param indexMap is a pointer to an array of length * indexes which will reflect the reordering of the characters. * The array does not need to be initialized.

* The index map will result in indexMap[visualIndex]==logicalIndex. * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ubidi_reorderVisual(const UBiDiLevel *levels, int32_t length, int32_t *indexMap); /** * Invert an index map. * The index mapping of the first map is inverted and written to * the second one. * * @param srcMap is an array with length elements * which defines the original mapping from a source array containing * length elements to a destination array. * Some elements of the source array may have no mapping in the * destination array. In that case, their value will be * the special value UBIDI_MAP_NOWHERE. * All elements must be >=0 or equal to UBIDI_MAP_NOWHERE. * Some elements may have a value >= length, if the * destination array has more elements than the source array. * There must be no duplicate indexes (two or more elements with the * same value except UBIDI_MAP_NOWHERE). * * @param destMap is an array with a number of elements equal to 1 + the highest * value in srcMap. * destMap will be filled with the inverse mapping. * If element with index i in srcMap has a value k different * from UBIDI_MAP_NOWHERE, this means that element i of * the source array maps to element k in the destination array. * The inverse map will have value i in its k-th element. * For all elements of the destination array which do not map to * an element in the source array, the corresponding element in the * inverse map will have a value equal to UBIDI_MAP_NOWHERE. * * @param length is the length of each array. 
* @see UBIDI_MAP_NOWHERE * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ubidi_invertMap(const int32_t *srcMap, int32_t *destMap, int32_t length); /** option flags for ubidi_writeReordered() */ /** * option bit for ubidi_writeReordered(): * keep combining characters after their base characters in RTL runs * * @see ubidi_writeReordered * @stable ICU 2.0 */ #define UBIDI_KEEP_BASE_COMBINING 1 /** * option bit for ubidi_writeReordered(): * replace characters with the "mirrored" property in RTL runs * by their mirror-image mappings * * @see ubidi_writeReordered * @stable ICU 2.0 */ #define UBIDI_DO_MIRRORING 2 /** * option bit for ubidi_writeReordered(): * surround the run with LRMs if necessary; * this is part of the approximate "inverse Bidi" algorithm * *

This option does not imply corresponding adjustment of the index * mappings.

* * @see ubidi_setInverse * @see ubidi_writeReordered * @stable ICU 2.0 */ #define UBIDI_INSERT_LRM_FOR_NUMERIC 4 /** * option bit for ubidi_writeReordered(): * remove Bidi control characters * (this does not affect #UBIDI_INSERT_LRM_FOR_NUMERIC) * *

This option does not imply corresponding adjustment of the index * mappings.

* * @see ubidi_writeReordered * @stable ICU 2.0 */ #define UBIDI_REMOVE_BIDI_CONTROLS 8 /** * option bit for ubidi_writeReordered(): * write the output in reverse order * *

This has the same effect as calling ubidi_writeReordered() * first without this option, and then calling * ubidi_writeReverse() without mirroring. * Doing this in the same step is faster and avoids a temporary buffer. * An example for using this option is output to a character terminal that * is designed for RTL scripts and stores text in reverse order.

* * @see ubidi_writeReordered * @stable ICU 2.0 */ #define UBIDI_OUTPUT_REVERSE 16 /** * Get the length of the source text processed by the last call to * ubidi_setPara(). This length may be different from the length * of the source text if option #UBIDI_OPTION_STREAMING * has been set. *
* Note that whenever the length of the text affects the execution or the * result of a function, it is the processed length which must be considered, * except for ubidi_setPara (which receives unprocessed source * text) and ubidi_getLength (which returns the original length * of the source text).
* In particular, the processed length is the one to consider in the following * cases: *
    *
  • maximum value of the limit argument of * ubidi_setLine
  • *
  • maximum value of the charIndex argument of * ubidi_getParagraph
  • *
  • maximum value of the charIndex argument of * ubidi_getLevelAt
  • *
  • number of elements in the array returned by ubidi_getLevels
  • *
  • maximum value of the logicalPosition argument of * ubidi_getLogicalRun
  • *
  • maximum value of the logicalIndex argument of * ubidi_getVisualIndex
  • *
  • number of elements filled in the *indexMap argument of * ubidi_getLogicalMap
  • *
  • length of text processed by ubidi_writeReordered
  • *
* * @param pBiDi is the paragraph UBiDi object. * * @return The length of the part of the source text processed by * the last call to ubidi_setPara. * @see ubidi_setPara * @see UBIDI_OPTION_STREAMING * @stable ICU 3.6 */ U_CAPI int32_t U_EXPORT2 ubidi_getProcessedLength(const UBiDi *pBiDi); /** * Get the length of the reordered text resulting from the last call to * ubidi_setPara(). This length may be different from the length * of the source text if option #UBIDI_OPTION_INSERT_MARKS * or option #UBIDI_OPTION_REMOVE_CONTROLS has been set. *
* This resulting length is the one to consider in the following cases: *
    *
  • maximum value of the visualIndex argument of * ubidi_getLogicalIndex
  • *
  • number of elements of the *indexMap argument of * ubidi_getVisualMap
  • *
* Note that this length stays identical to the source text length if * Bidi marks are inserted or removed using option bits of * ubidi_writeReordered, or if option * #UBIDI_REORDER_INVERSE_NUMBERS_AS_L has been set. * * @param pBiDi is the paragraph UBiDi object. * * @return The length of the reordered text resulting from * the last call to ubidi_setPara. * @see ubidi_setPara * @see UBIDI_OPTION_INSERT_MARKS * @see UBIDI_OPTION_REMOVE_CONTROLS * @stable ICU 3.6 */ U_CAPI int32_t U_EXPORT2 ubidi_getResultLength(const UBiDi *pBiDi); U_CDECL_BEGIN /** * Callback type declaration for overriding default Bidi class values with * custom ones. *

Usually, the function pointer will be propagated to a UBiDi * object by calling the ubidi_setClassCallback() function; * then the callback will be invoked by the UBA implementation any time the * class of a character is to be determined.

* * @param context is a pointer to the callback private data. * * @param c is the code point to get a Bidi class for. * * @return The directional property / Bidi class for the given code point * c if the default class has been overridden, or * u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS)+1 * if the standard Bidi class value for c is to be used. * @see ubidi_setClassCallback * @see ubidi_getClassCallback * @stable ICU 3.6 */ typedef UCharDirection U_CALLCONV UBiDiClassCallback(const void *context, UChar32 c); U_CDECL_END /** * Retrieve the Bidi class for a given code point. *

If a #UBiDiClassCallback callback is defined and returns a * value other than u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS)+1, * that value is used; otherwise the default class determination mechanism is invoked.

* * @param pBiDi is the paragraph UBiDi object. * * @param c is the code point whose Bidi class must be retrieved. * * @return The Bidi class for character c based * on the given pBiDi instance. * @see UBiDiClassCallback * @stable ICU 3.6 */ U_CAPI UCharDirection U_EXPORT2 ubidi_getCustomizedClass(UBiDi *pBiDi, UChar32 c); /** * Set the callback function and callback data used by the UBA * implementation for Bidi class determination. *

This may be useful for assigning Bidi classes to PUA characters, or * for special application needs. For instance, an application may want to * handle all spaces like L or R characters (according to the base direction) * when creating the visual ordering of logical lines which are part of a report * organized in columns: there should not be interaction between adjacent * cells.

* * @param pBiDi is the paragraph UBiDi object. * * @param newFn is the new callback function pointer. * * @param newContext is the new callback context pointer. This can be NULL. * * @param oldFn fillin: Returns the old callback function pointer. This can be * NULL. * * @param oldContext fillin: Returns the old callback's context. This can be * NULL. * * @param pErrorCode must be a valid pointer to an error code value. * * @see ubidi_getClassCallback * @stable ICU 3.6 */ U_CAPI void U_EXPORT2 ubidi_setClassCallback(UBiDi *pBiDi, UBiDiClassCallback *newFn, const void *newContext, UBiDiClassCallback **oldFn, const void **oldContext, UErrorCode *pErrorCode); /** * Get the current callback function used for Bidi class determination. * * @param pBiDi is the paragraph UBiDi object. * * @param fn fillin: Returns the callback function pointer. * * @param context fillin: Returns the callback's private context. * * @see ubidi_setClassCallback * @stable ICU 3.6 */ U_CAPI void U_EXPORT2 ubidi_getClassCallback(UBiDi *pBiDi, UBiDiClassCallback **fn, const void **context); /** * Take a UBiDi object containing the reordering * information for a piece of text (one or more paragraphs) set by * ubidi_setPara() or for a line of text set by * ubidi_setLine() and write a reordered string to the * destination buffer. * * This function preserves the integrity of characters with multiple * code units and (optionally) combining characters. * Characters in RTL runs can be replaced by mirror-image characters * in the destination buffer. Note that "real" mirroring has * to be done in a rendering engine by glyph selection * and that for many "mirrored" characters there are no * Unicode characters as mirror-image equivalents. * There are also options to insert or remove Bidi control * characters; see the description of the destSize * and options parameters and of the option bit flags. 
* * @param pBiDi A pointer to a UBiDi object that * is set by ubidi_setPara() or * ubidi_setLine() and contains the reordering * information for the text that it was defined for, * as well as a pointer to that text.

* The text was aliased (only the pointer was stored * without copying the contents) and must not have been modified * since the ubidi_setPara() call. * * @param dest A pointer to where the reordered text is to be copied. * The source text and dest[destSize] * must not overlap. * * @param destSize The size of the dest buffer, * in number of UChars. * If the UBIDI_INSERT_LRM_FOR_NUMERIC * option is set, then the destination length could be * as large as * ubidi_getLength(pBiDi)+2*ubidi_countRuns(pBiDi). * If the UBIDI_REMOVE_BIDI_CONTROLS option * is set, then the destination length may be less than * ubidi_getLength(pBiDi). * If none of these options is set, then the destination length * will be exactly ubidi_getProcessedLength(pBiDi). * * @param options A bit set of options for the reordering that control * how the reordered text is written. * The options include mirroring the characters on a code * point basis and inserting LRM characters, which is used * especially for transforming visually stored text * to logically stored text (although this is still an * imperfect implementation of an "inverse Bidi" algorithm * because it uses the "forward Bidi" algorithm at its core). * The available options are: * #UBIDI_DO_MIRRORING, * #UBIDI_INSERT_LRM_FOR_NUMERIC, * #UBIDI_KEEP_BASE_COMBINING, * #UBIDI_OUTPUT_REVERSE, * #UBIDI_REMOVE_BIDI_CONTROLS * * @param pErrorCode must be a valid pointer to an error code value. * * @return The length of the output string. * * @see ubidi_getProcessedLength * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 ubidi_writeReordered(UBiDi *pBiDi, UChar *dest, int32_t destSize, uint16_t options, UErrorCode *pErrorCode); /** * Reverse a Right-To-Left run of Unicode text. * * This function preserves the integrity of characters with multiple * code units and (optionally) combining characters. * Characters can be replaced by mirror-image characters * in the destination buffer. 
Note that "real" mirroring has * to be done in a rendering engine by glyph selection * and that for many "mirrored" characters there are no * Unicode characters as mirror-image equivalents. * There are also options to insert or remove Bidi control * characters. * * This function is the implementation for reversing RTL runs as part * of ubidi_writeReordered(). For detailed descriptions * of the parameters, see there. * Since no Bidi controls are inserted here, the output string length * will never exceed srcLength. * * @see ubidi_writeReordered * * @param src A pointer to the RTL run text. * * @param srcLength The length of the RTL run. * * @param dest A pointer to where the reordered text is to be copied. * src[srcLength] and dest[destSize] * must not overlap. * * @param destSize The size of the dest buffer, * in number of UChars. * If the UBIDI_REMOVE_BIDI_CONTROLS option * is set, then the destination length may be less than * srcLength. * If this option is not set, then the destination length * will be exactly srcLength. * * @param options A bit set of options for the reordering that control * how the reordered text is written. * See the options parameter in ubidi_writeReordered(). * * @param pErrorCode must be a valid pointer to an error code value. * * @return The length of the output string. * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 ubidi_writeReverse(const UChar *src, int32_t srcLength, UChar *dest, int32_t destSize, uint16_t options, UErrorCode *pErrorCode); /*#define BIDI_SAMPLE_CODE*/ /*@}*/ #endif // ubiditransform.h /* ****************************************************************************** * * Copyright (C) 2016 and later: Unicode, Inc. and others. 
* License & terms of use: http://www.unicode.org/copyright.html * ****************************************************************************** * file name: ubiditransform.h * encoding: UTF-8 * tab size: 8 (not used) * indentation:4 * * created on: 2016jul24 * created by: Lina Kemmel * */ #ifndef UBIDITRANSFORM_H #define UBIDITRANSFORM_H /** * \file * \brief Bidi Transformations */ /** * `UBiDiOrder` indicates the order of text. * * This bidi transformation engine supports all possible combinations (4 in * total) of input and output text order: * * - <logical input, visual output>: unless the output direction is RTL, this * corresponds to a normal operation of the Bidi algorithm as described in the * Unicode Technical Report and implemented by `UBiDi` when the * reordering mode is set to `UBIDI_REORDER_DEFAULT`. Visual RTL * mode is not supported by `UBiDi` and is accomplished through * reversing a visual LTR string, * * - <visual input, logical output>: unless the input direction is RTL, this * corresponds to an "inverse bidi algorithm" in `UBiDi` with the * reordering mode set to `UBIDI_REORDER_INVERSE_LIKE_DIRECT`. * Visual RTL mode is not supported by `UBiDi` and is * accomplished through reversing a visual LTR string, * * - <logical input, logical output>: if the input and output base directions * mismatch, this corresponds to the `UBiDi` implementation with the * reordering mode set to `UBIDI_REORDER_RUNS_ONLY`; and if the * input and output base directions are identical, the transformation engine * will only handle character mirroring and Arabic shaping operations without * reordering, * * - <visual input, visual output>: this reordering mode is not supported by * the `UBiDi` engine; it implies character mirroring, Arabic * shaping, and - if the input/output base directions mismatch - string * reverse operations. * @see ubidi_setInverse * @see ubidi_setReorderingMode * @see UBIDI_REORDER_DEFAULT * @see UBIDI_REORDER_INVERSE_LIKE_DIRECT * @see UBIDI_REORDER_RUNS_ONLY * @stable ICU 58 */ typedef enum { /** 0: Constant indicating a logical order.
* This is the default for input text. * @stable ICU 58 */ UBIDI_LOGICAL = 0, /** 1: Constant indicating a visual order. * This is the default for output text. * @stable ICU 58 */ UBIDI_VISUAL } UBiDiOrder; /** * UBiDiMirroring indicates whether or not characters with the * "mirrored" property in RTL runs should be replaced with their mirror-image * counterparts. * @see UBIDI_DO_MIRRORING * @see ubidi_setReorderingOptions * @see ubidi_writeReordered * @see ubidi_writeReverse * @stable ICU 58 */ typedef enum { /** 0: Constant indicating that character mirroring should not be * performed. * This is the default. * @stable ICU 58 */ UBIDI_MIRRORING_OFF = 0, /** 1: Constant indicating that character mirroring should be performed. * This corresponds to calling ubidi_writeReordered or * ubidi_writeReverse with the * UBIDI_DO_MIRRORING option bit set. * @stable ICU 58 */ UBIDI_MIRRORING_ON } UBiDiMirroring; /** * Forward declaration of the UBiDiTransform structure that stores * information used by the layout transformation engine. * @stable ICU 58 */ typedef struct UBiDiTransform UBiDiTransform; #if (NTDDI_VERSION >= NTDDI_WIN10_RS5) /** * Performs transformation of text from the bidi layout defined by the input * ordering scheme to the bidi layout defined by the output ordering scheme, * and applies character mirroring and Arabic shaping operations.

* In terms of UBiDi, such a transformation implies: *

    *
  • calling ubidi_setReorderingMode as needed (when the * reordering mode is other than normal),
  • *
  • calling ubidi_setInverse as needed (when text should be * transformed from a visual to a logical form),
  • *
  • resolving embedding levels of each character in the input text by * calling ubidi_setPara,
  • *
  • reordering the characters based on the computed embedding levels, also * performing character mirroring as needed, and streaming the result to the * output, by calling ubidi_writeReordered,
  • *
  • performing Arabic digit and letter shaping on the output text by calling * u_shapeArabic.
  • *
* An "ordering scheme" encompasses the base direction and the order of text, * and these characteristics must be defined by the caller for both input and * output explicitly.

* There are 36 possible combinations of <input, output> ordering schemes, * which are partially supported by UBiDi already. Examples of the * currently supported combinations: *

    *
  • <Logical LTR, Visual LTR>: this is equivalent to calling * ubidi_setPara with paraLevel == UBIDI_LTR,
  • *
  • <Logical RTL, Visual LTR>: this is equivalent to calling * ubidi_setPara with paraLevel == UBIDI_RTL,
  • *
  • <Logical Default ("Auto") LTR, Visual LTR>: this is equivalent to * calling ubidi_setPara with * paraLevel == UBIDI_DEFAULT_LTR,
  • *
  • <Logical Default ("Auto") RTL, Visual LTR>: this is equivalent to * calling ubidi_setPara with * paraLevel == UBIDI_DEFAULT_RTL,
  • *
  • <Visual LTR, Logical LTR>: this is equivalent to * calling ubidi_setInverse(UBiDi*, true) and then * ubidi_setPara with paraLevel == UBIDI_LTR,
  • *
  • <Visual LTR, Logical RTL>: this is equivalent to * calling ubidi_setInverse(UBiDi*, true) and then * ubidi_setPara with paraLevel == UBIDI_RTL.
  • *
* All combinations that involve the Visual RTL scheme are unsupported by * UBiDi, for instance: *
    *
  • <Logical LTR, Visual RTL>,
  • *
  • <Visual RTL, Logical RTL>.
  • *
*

Example of usage of the transformation engine:
*

 * \code
 * UChar text1[] = {'a', 'b', 'c', 0x0625, '1', 0};
 * UChar text2[] = {'a', 'b', 'c', 0x0625, '1', 0};
 * UErrorCode errorCode = U_ZERO_ERROR;
 * // Run a transformation.
 * ubiditransform_transform(pBidiTransform,
 *          text1, -1, text2, -1,
 *          UBIDI_LTR, UBIDI_VISUAL,
 *          UBIDI_RTL, UBIDI_LOGICAL,
 *          UBIDI_MIRRORING_OFF,
 *          U_SHAPE_DIGITS_AN2EN | U_SHAPE_DIGIT_TYPE_AN_EXTENDED,
 *          &errorCode);
 * // Do something with text2.
 *  text2[4] = '2';
 * // Run a reverse transformation.
 * ubiditransform_transform(pBidiTransform,
 *          text2, -1, text1, -1,
 *          UBIDI_RTL, UBIDI_LOGICAL,
 *          UBIDI_LTR, UBIDI_VISUAL,
 *          UBIDI_MIRRORING_OFF,
 *          U_SHAPE_DIGITS_EN2AN | U_SHAPE_DIGIT_TYPE_AN_EXTENDED,
 *          &errorCode);
 *\endcode
 * 
*

* * @param pBiDiTransform A pointer to a UBiDiTransform object * allocated with ubiditransform_open() or * NULL.

* This object serves for one-time setup to amortize initialization * overheads. Use of this object is not thread-safe. All other threads * should allocate a new UBiDiTransform object by calling * ubiditransform_open() before using it. Alternatively, * a caller can set this parameter to NULL, in which case * the object will be allocated by the engine on the fly.

* @param src A pointer to the text that the Bidi layout transformations will * be performed on. *

Note: the text must be (at least) * srcLength long.

* @param srcLength The length of the text, in number of UChars. If * srcLength == -1 then the text must be zero-terminated. * @param dest A pointer to where the processed text is to be copied. * @param destSize The size of the dest buffer, in number of * UChars. If the U_SHAPE_LETTERS_UNSHAPE option is set, * then the destination length could be as large as * srcLength * 2. Otherwise, the destination length will * not exceed srcLength. If the caller reserves the last * position for zero-termination, it should be excluded from * destSize. *

destSize == -1 is allowed and makes sense when * dest already holds some meaningful value, e.g. that of * src. In this case dest must be * zero-terminated.

* @param inParaLevel A base embedding level of the input as defined in * ubidi_setPara documentation for the * paraLevel parameter. * @param inOrder An order of the input, which can be one of the * UBiDiOrder values. * @param outParaLevel A base embedding level of the output as defined in * ubidi_setPara documentation for the * paraLevel parameter. * @param outOrder An order of the output, which can be one of the * UBiDiOrder values. * @param doMirroring Indicates whether or not to perform character mirroring, * and can accept one of the UBiDiMirroring values. * @param shapingOptions Arabic digit and letter shaping options defined in the * ushape.h documentation. *

Note: Direction indicator options are computed by * the transformation engine based on the effective ordering schemes, so * user-defined direction indicators will be ignored.

* @param pErrorCode A pointer to an error code value. * * @return The destination length, i.e. the number of UChars written to * dest. If the transformation fails, the return value * will be 0 (and the error code will be written to * pErrorCode). * * @see UBiDiLevel * @see UBiDiOrder * @see UBiDiMirroring * @see ubidi_setPara * @see u_shapeArabic * @stable ICU 58 */ U_CAPI uint32_t U_EXPORT2 ubiditransform_transform(UBiDiTransform *pBiDiTransform, const UChar *src, int32_t srcLength, UChar *dest, int32_t destSize, UBiDiLevel inParaLevel, UBiDiOrder inOrder, UBiDiLevel outParaLevel, UBiDiOrder outOrder, UBiDiMirroring doMirroring, uint32_t shapingOptions, UErrorCode *pErrorCode); /** * Allocates a UBiDiTransform object. This object can be reused, * e.g. with different ordering schemes, mirroring or shaping options.

* Note: The object can only be reused in the same thread. * All other threads should allocate a new UBiDiTransform object * before using it.

* Example of usage:

*

 * \code
 * UErrorCode errorCode = U_ZERO_ERROR;
 * // Open a new UBiDiTransform.
 * UBiDiTransform* transform = ubiditransform_open(&errorCode);
 * // Run a transformation.
 * ubiditransform_transform(transform,
 *          text1, -1, text2, -1,
 *          UBIDI_RTL, UBIDI_LOGICAL,
 *          UBIDI_LTR, UBIDI_VISUAL,
 *          UBIDI_MIRRORING_ON,
 *          U_SHAPE_DIGITS_EN2AN,
 *          &errorCode);
 * // Do something with the output text and invoke another transformation using
 * //   that text as input.
 * ubiditransform_transform(transform,
 *          text2, -1, text3, -1,
 *          UBIDI_LTR, UBIDI_VISUAL,
 *          UBIDI_RTL, UBIDI_VISUAL,
 *          UBIDI_MIRRORING_ON,
 *          0, &errorCode);
 *\endcode
 * 
*

* The UBiDiTransform object must be deallocated by calling * ubiditransform_close(). * * @return An empty UBiDiTransform object. * @stable ICU 58 */ U_CAPI UBiDiTransform* U_EXPORT2 ubiditransform_open(UErrorCode *pErrorCode); /** * Deallocates the given UBiDiTransform object. * @stable ICU 58 */ U_CAPI void U_EXPORT2 ubiditransform_close(UBiDiTransform *pBidiTransform); #endif // (NTDDI_VERSION >= NTDDI_WIN10_RS5) #endif // utext.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * * Copyright (C) 2004-2012, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* * file name: utext.h * encoding: UTF-8 * tab size: 8 (not used) * indentation:4 * * created on: 2004oct06 * created by: Markus W. Scherer */ #ifndef __UTEXT_H__ #define __UTEXT_H__ /** * \file * \brief C API: Abstract Unicode Text API * * The Text Access API provides a means to allow text that is stored in alternative * formats to work with ICU services. ICU normally operates on text that is * stored in UTF-16 format, in (UChar *) arrays for the C APIs or as type * UnicodeString for C++ APIs. * * ICU Text Access allows other formats, such as UTF-8 or non-contiguous * UTF-16 strings, to be placed in a UText wrapper and then passed to ICU services. * * There are three general classes of usage for UText: * * Application Level Use. This is the simplest usage - applications would * use one of the utext_open() functions on their input text, and pass * the resulting UText to the desired ICU service. * * Second is usage in ICU Services, such as break iteration, that will need to * operate on input presented to them as a UText. These implementations * will need to use the iteration and related UText functions to gain * access to the actual text. 
* * The third class of UText users are "text providers." These are the * UText implementations for the various text storage formats. An application * or system with a unique text storage format can implement a set of * UText provider functions for that format, which will then allow * ICU services to operate on that format. * * * Iterating over text * * Here is sample code for a forward iteration over the contents of a UText * * \code * UChar32 c; * UText *ut = whatever(); * * for (c=utext_next32From(ut, 0); c>=0; c=utext_next32(ut)) { * // do whatever with the codepoint c here. * } * \endcode * * And here is similar code to iterate in the reverse direction, from the end * of the text towards the beginning. * * \code * UChar32 c; * UText *ut = whatever(); * int textLength = utext_nativeLength(ut); * for (c=utext_previous32From(ut, textLength); c>=0; c=utext_previous32(ut)) { * // do whatever with the codepoint c here. * } * \endcode * * Characters and Indexing * * Indexing into text by UText functions is nearly always in terms of the native * indexing of the underlying text storage. The storage format could be UTF-8 * or UTF-32, for example. When coding to the UText access API, no assumptions * can be made regarding the size of characters, or how far an index * may move when iterating between characters. * * All indices supplied to UText functions are pinned to the length of the * text. An out-of-bounds index is not considered to be an error, but is * adjusted to be in the range 0 <= index <= length of input text. * * * When an index position is returned from a UText function, it will be * a native index to the underlying text. In the case of multi-unit characters, * it will always refer to the first position of the character, * never to the interior. This is essentially the same thing as saying that * a returned index will always point to a boundary between characters. 
* * When a native index is supplied to a UText function, all indices that * refer to any part of a multi-unit character representation are considered * to be equivalent. In the case of multi-unit characters, an incoming index * will be logically normalized to refer to the start of the character. * * It is possible to test whether a native index is on a code point boundary * by doing a utext_setNativeIndex() followed by a utext_getNativeIndex(). * If the index is returned unchanged, it was on a code point boundary. If * an adjusted index is returned, the original index referred to the * interior of a character. * * Conventions for calling UText functions * * Most UText access functions have as their first parameter a (UText *) pointer, * which specifies the UText to be used. Unless otherwise noted, the * pointer must refer to a valid, open UText. Attempting to * use a closed UText or passing a NULL pointer is a programming error and * will produce undefined results or NULL pointer exceptions. * * The utext_open family of functions can either open an existing (closed) * UText, or heap allocate a new UText. Here is sample code for creating * a stack-allocated UText. * * \code * char *s = whatever(); // A utf-8 string * UErrorCode status = U_ZERO_ERROR; * UText ut = UTEXT_INITIALIZER; * utext_openUTF8(&ut, s, -1, &status); * if (U_FAILURE(status)) { * // error handling * } else { * // work with the UText * } * \endcode * * Any existing UText passed to an open function _must_ have been initialized, * either by the UTEXT_INITIALIZER, or by having been originally heap-allocated * by an open function. Passing NULL will cause the open function to * heap-allocate and fully initialize a new UText. * */ U_CDECL_BEGIN struct UText; typedef struct UText UText; /**< C typedef for struct UText. 
@stable ICU 3.6 */ /*************************************************************************************** * * C Functions for creating UText wrappers around various kinds of text strings.
* ****************************************************************************************/ /** * Close function for UText instances. * Cleans up, releases any resources being held by an open UText. *

* If the UText was originally allocated by one of the utext_open functions, * the storage associated with the utext will also be freed. * If the UText storage originated with the application, as it would with * a local or static instance, the storage will not be deleted. * * An open UText can be reset to refer to new string by using one of the utext_open() * functions without first closing the UText. * * @param ut The UText to be closed. * @return NULL if the UText struct was deleted by the close. If the UText struct * was originally provided by the caller to the open function, it is * returned by this function, and may be safely used again in * a subsequent utext_open. * * @stable ICU 3.4 */ U_CAPI UText * U_EXPORT2 utext_close(UText *ut); /** * Open a read-only UText implementation for UTF-8 strings. * * \htmlonly * Any invalid UTF-8 in the input will be handled in this way: * a sequence of bytes that has the form of a truncated, but otherwise valid, * UTF-8 sequence will be replaced by a single unicode replacement character, \uFFFD. * Any other illegal bytes will each be replaced by a \uFFFD. * \endhtmlonly * * @param ut Pointer to a UText struct. If NULL, a new UText will be created. * If non-NULL, must refer to an initialized UText struct, which will then * be reset to reference the specified UTF-8 string. * @param s A UTF-8 string. Must not be NULL. * @param length The length of the UTF-8 string in bytes, or -1 if the string is * zero terminated. * @param status Errors are returned here. * @return A pointer to the UText. If a pre-allocated UText was provided, it * will always be used and returned. * @stable ICU 3.4 */ U_CAPI UText * U_EXPORT2 utext_openUTF8(UText *ut, const char *s, int64_t length, UErrorCode *status); /** * Open a read-only UText for UChar * string. * * @param ut Pointer to a UText struct. If NULL, a new UText will be created. 
* If non-NULL, must refer to an initialized UText struct, which will then * be reset to reference the specified UChar string. * @param s A UChar (UTF-16) string * @param length The number of UChars in the input string, or -1 if the string is * zero terminated. * @param status Errors are returned here. * @return A pointer to the UText. If a pre-allocated UText was provided, it * will always be used and returned. * @stable ICU 3.4 */ U_CAPI UText * U_EXPORT2 utext_openUChars(UText *ut, const UChar *s, int64_t length, UErrorCode *status); /** * Clone a UText. This is much like opening a UText where the source text is itself * another UText. * * A deep clone will copy both the UText data structures and the underlying text. * The original and cloned UText will operate completely independently; modifications * made to the text in one will not affect the other. Text providers are not * required to support deep clones. The user of clone() must check the status return * and be prepared to handle failures. * * The standard UText implementations for UTF8, UChar *, UnicodeString and * Replaceable all support deep cloning. * * The UText returned from a deep clone will be writable, assuming that the text * provider is able to support writing, even if the source UText had been made * non-writable by means of utext_freeze(). * * A shallow clone replicates only the UText data structures; it does not make * a copy of the underlying text. Shallow clones can be used as an efficient way to * have multiple iterators active in a single text string that is not being * modified. * * A shallow clone operation will not fail, barring truly exceptional conditions such * as memory allocation failures. * * Shallow UText clones should be avoided if the UText functions that modify the * text are expected to be used, either on the original or the cloned UText. * Any such modifications can cause unpredictable behavior.
Read Only * shallow clones provide some protection against errors of this type by * disabling text modification via the cloned UText. * * A shallow clone made with the readOnly parameter == false will preserve the * utext_isWritable() state of the source object. Note, however, that * write operations must be avoided while more than one UText exists that refer * to the same underlying text. * * A UText and its clone may be safely concurrently accessed by separate threads. * This is true for read access only with shallow clones, and for both read and * write access with deep clones. * It is the responsibility of the Text Provider to ensure that this thread safety * constraint is met. * * @param dest A UText struct to be filled in with the result of the clone operation, * or NULL if the clone function should heap-allocate a new UText struct. * If non-NULL, must refer to an already existing UText, which will then * be reset to become the clone. * @param src The UText to be cloned. * @param deep true to request a deep clone, false for a shallow clone. * @param readOnly true to request that the cloned UText have read only access to the * underlying text. * @param status Errors are returned here. For deep clones, U_UNSUPPORTED_ERROR * will be returned if the text provider is unable to clone the * original text. * @return The newly created clone, or NULL if the clone operation failed. * @stable ICU 3.4 */ U_CAPI UText * U_EXPORT2 utext_clone(UText *dest, const UText *src, UBool deep, UBool readOnly, UErrorCode *status); /** * Compare two UText objects for equality. * UTexts are equal if they are iterating over the same text, and * have the same iteration position within the text. * If either or both of the parameters are NULL, the comparison is false. * * @param a The first of the two UTexts to compare. * @param b The other UText to be compared. * @return true if the two UTexts are equal. 
* @stable ICU 3.6 */ U_CAPI UBool U_EXPORT2 utext_equals(const UText *a, const UText *b); /***************************************************************************** * * Functions to work with the text represented by a UText wrapper * *****************************************************************************/ /** * Get the length of the text. Depending on the characteristics * of the underlying text representation, this may be expensive. * @see utext_isLengthExpensive() * * * @param ut the text to be accessed. * @return the length of the text, expressed in native units. * * @stable ICU 3.4 */ U_CAPI int64_t U_EXPORT2 utext_nativeLength(UText *ut); /** * Return true if calculating the length of the text could be expensive. * Finding the length of NUL terminated strings is considered to be expensive. * * Note that the value of this function may change * as the result of other operations on a UText. * Once the length of a string has been discovered, it will no longer * be expensive to report it. * * @param ut the text to be accessed. * @return true if determining the length of the text could be time consuming. * @stable ICU 3.4 */ U_CAPI UBool U_EXPORT2 utext_isLengthExpensive(const UText *ut); /** * Returns the code point at the requested index, * or U_SENTINEL (-1) if it is out of bounds. * * If the specified index points to the interior of a multi-unit * character - one of the trail bytes of a UTF-8 sequence, for example - * the complete code point will be returned. * * The iteration position will be set to the start of the returned code point. 
* * This function is roughly equivalent to the sequence * utext_setNativeIndex(index); * utext_current32(); * (There is a subtle difference if the index is out of bounds by being less than zero - * utext_setNativeIndex(negative value) sets the index to zero, after which utext_current32() * will return the char at zero. utext_char32At(negative index), on the other hand, will * return the U_SENTINEL value of -1.)
* * @param ut the text to be accessed * @param nativeIndex the native index of the character to be accessed. If the index points * to other than the first unit of a multi-unit character, it will be adjusted * to the start of the character. * @return the code point at the specified index. * @stable ICU 3.4 */ U_CAPI UChar32 U_EXPORT2 utext_char32At(UText *ut, int64_t nativeIndex); /** * * Get the code point at the current iteration position, * or U_SENTINEL (-1) if the iteration has reached the end of * the input text. * * @param ut the text to be accessed. * @return the Unicode code point at the current iterator position. * @stable ICU 3.4 */ U_CAPI UChar32 U_EXPORT2 utext_current32(UText *ut); /** * Get the code point at the current iteration position of the UText, and * advance the position to the first index following the character. * * If the position is at the end of the text (the index following * the last character, which is also the length of the text), * return U_SENTINEL (-1) and do not advance the index. * * This is a post-increment operation. * * An inline macro version of this function, UTEXT_NEXT32(), * is available for performance critical use. * * @param ut the text to be accessed. * @return the Unicode code point at the iteration position. * @see UTEXT_NEXT32 * @stable ICU 3.4 */ U_CAPI UChar32 U_EXPORT2 utext_next32(UText *ut); /** * Move the iterator position to the character (code point) whose * index precedes the current position, and return that character. * This is a pre-decrement operation. * * If the initial position is at the start of the text (index of 0) * return U_SENTINEL (-1), and leave the position unchanged. * * An inline macro version of this function, UTEXT_PREVIOUS32(), * is available for performance critical use. * * @param ut the text to be accessed. * @return the previous UChar32 code point, or U_SENTINEL (-1) * if the iteration has reached the start of the text. 
* @see UTEXT_PREVIOUS32 * @stable ICU 3.4 */ U_CAPI UChar32 U_EXPORT2 utext_previous32(UText *ut); /** * Set the iteration index and return the code point at that index. * Leave the iteration index at the start of the following code point. * * This function is the most efficient and convenient way to * begin a forward iteration. The results are identical to those * from the sequence * \code * utext_setNativeIndex(ut, nativeIndex); * utext_next32(ut); * \endcode * * @param ut the text to be accessed. * @param nativeIndex Iteration index, in the native units of the text provider. * @return Code point which starts at or before index, * or U_SENTINEL (-1) if it is out of bounds. * @stable ICU 3.4 */ U_CAPI UChar32 U_EXPORT2 utext_next32From(UText *ut, int64_t nativeIndex); /** * Set the iteration index, and return the code point preceding the * one specified by the initial index. Leave the iteration position * at the start of the returned code point. * * This function is the most efficient and convenient way to * begin a backwards iteration. * * @param ut the text to be accessed. * @param nativeIndex Iteration index in the native units of the text provider. * @return Code point preceding the one at the initial index, * or U_SENTINEL (-1) if it is out of bounds. * * @stable ICU 3.4 */ U_CAPI UChar32 U_EXPORT2 utext_previous32From(UText *ut, int64_t nativeIndex); /** * Get the current iterator position, which can range from 0 to * the length of the text. * The position is a native index into the input text, in whatever format it * may have (possibly UTF-8 for example), and may not always be the same as * the corresponding UChar (UTF-16) index. * The returned position will always be aligned to a code point boundary. * * @param ut the text to be accessed. * @return the current index position, in the native units of the text provider.
* @stable ICU 3.4 */ U_CAPI int64_t U_EXPORT2 utext_getNativeIndex(const UText *ut); /** * Set the current iteration position to the nearest code point * boundary at or preceding the specified index. * The index is in the native units of the original input text. * If the index is out of range, it will be pinned to be within * the range of the input text. *

* It will usually be more efficient to begin an iteration * using the functions utext_next32From() or utext_previous32From() * rather than utext_setNativeIndex(). *

* Moving the index position to an adjacent character is best done * with utext_next32(), utext_previous32() or utext_moveIndex32(). * Attempting to do direct arithmetic on the index position is * complicated by the fact that the size (in native units) of a * character depends on the underlying representation of the character * (UTF-8, UTF-16, UTF-32, arbitrary codepage), and is not * easily knowable. * * @param ut the text to be accessed. * @param nativeIndex the native unit index of the new iteration position. * @stable ICU 3.4 */ U_CAPI void U_EXPORT2 utext_setNativeIndex(UText *ut, int64_t nativeIndex); /** * Move the iterator position by delta code points. The number of code points * is a signed number; a negative delta will move the iterator backwards, * towards the start of the text. *

* The index is moved by delta code points * forward or backward, but no further backward than to 0 and * no further forward than to utext_nativeLength(). * The resulting index value will be in between 0 and length, inclusive. * * @param ut the text to be accessed. * @param delta the signed number of code points to move the iteration position. * @return true if the position could be moved the requested number of positions while * staying within the range [0 - text length]. * @stable ICU 3.4 */ U_CAPI UBool U_EXPORT2 utext_moveIndex32(UText *ut, int32_t delta); /** * Get the native index of the character preceding the current position. * If the iteration position is already at the start of the text, zero * is returned. * The value returned is the same as that obtained from the following sequence, * but without the side effect of changing the iteration position. * * \code * UText *ut = whatever; * ... * utext_previous32(ut); * utext_getNativeIndex(ut); * \endcode * * This function is most useful during forwards iteration, where it will get the * native index of the character most recently returned from utext_next32(). * * @param ut the text to be accessed * @return the native index of the character preceding the current index position, * or zero if the current position is at the start of the text. * @stable ICU 3.6 */ U_CAPI int64_t U_EXPORT2 utext_getPreviousNativeIndex(UText *ut); /** * * Extract text from a UText into a UChar buffer. The range of text to be extracted * is specified in the native indices of the UText provider. These may not necessarily * be UTF-16 indices. *

* The size (number of 16 bit UChars) of the data to be extracted is returned. The * full number of UChars is returned, even when the extracted text is truncated * because the specified buffer size is too small. *

* The extracted string will (if you are a user) / must (if you are a text provider) * be NUL-terminated if there is sufficient space in the destination buffer. This * terminating NUL is not included in the returned length. *

* The iteration index is left at the position following the last extracted character. * * @param ut the UText from which to extract data. * @param nativeStart the native index of the first character to extract. * If the specified index is out of range, * it will be pinned to be within 0 <= index <= textLength. * @param nativeLimit the native string index of the position following the last * character to extract. If the specified index is out of range, * it will be pinned to be within 0 <= index <= textLength. * nativeLimit must be >= nativeStart. * @param dest the UChar (UTF-16) buffer into which the extracted text is placed * @param destCapacity The size, in UChars, of the destination buffer. May be zero * for precomputing the required size. * @param status receives any error status. * U_BUFFER_OVERFLOW_ERROR: the extracted text was truncated because the * buffer was too small. Returns number of UChars for preflighting. * @return Number of UChars in the data to be extracted. Does not include a trailing NUL. * * @stable ICU 3.4 */ U_CAPI int32_t U_EXPORT2 utext_extract(UText *ut, int64_t nativeStart, int64_t nativeLimit, UChar *dest, int32_t destCapacity, UErrorCode *status); /************************************************************************************ * * #define inline versions of selected performance-critical text access functions * Caution: do not use auto increment++ or decrement-- expressions * as parameters to these macros. * * For most use, where there is no extreme performance constraint, the * normal, non-inline functions are a better choice. The resulting code * will be smaller, and, if the need ever arises, easier to debug. * * These are implemented as #defines rather than real functions * because there is no fully portable way to do inline functions in plain C. 
* ************************************************************************************/ /** * inline version of utext_next32(), for performance-critical situations.
* * Get the code point at the current iteration position of the UText, and * advance the position to the first index following the character. * This is a post-increment operation. * Returns U_SENTINEL (-1) if the position is at the end of the * text. * * @stable ICU 3.4 */ #define UTEXT_NEXT32(ut) \ ((ut)->chunkOffset < (ut)->chunkLength && ((ut)->chunkContents)[(ut)->chunkOffset]<0xd800 ? \ ((ut)->chunkContents)[((ut)->chunkOffset)++] : utext_next32(ut)) /** * inline version of utext_previous32(), for performance-critical situations. * * Move the iterator position to the character (code point) whose * index precedes the current position, and return that character. * This is a pre-decrement operation. * Returns U_SENTINEL (-1) if the position is at the start of the text. * * @stable ICU 3.4 */ #define UTEXT_PREVIOUS32(ut) \ ((ut)->chunkOffset > 0 && \ (ut)->chunkContents[(ut)->chunkOffset-1] < 0xd800 ? \ (ut)->chunkContents[--((ut)->chunkOffset)] : utext_previous32(ut)) /** * inline version of utext_getNativeIndex(), for performance-critical situations. * * Get the current iterator position, which can range from 0 to * the length of the text. * The position is a native index into the input text, in whatever format it * may have (possibly UTF-8 for example), and may not always be the same as * the corresponding UChar (UTF-16) index. * The returned position will always be aligned to a code point boundary. * * @stable ICU 3.6 */ #define UTEXT_GETNATIVEINDEX(ut) \ ((ut)->chunkOffset <= (ut)->nativeIndexingLimit? \ (ut)->chunkNativeStart+(ut)->chunkOffset : \ (ut)->pFuncs->mapOffsetToNative(ut)) /** * inline version of utext_setNativeIndex(), for performance-critical situations. * * Set the current iteration position to the nearest code point * boundary at or preceding the specified index. * The index is in the native units of the original input text. * If the index is out of range, it will be pinned to be within * the range of the input text. 
* * @stable ICU 3.8 */ #define UTEXT_SETNATIVEINDEX(ut, ix) UPRV_BLOCK_MACRO_BEGIN { \ int64_t __offset = (ix) - (ut)->chunkNativeStart; \ if (__offset>=0 && __offset<(int64_t)(ut)->nativeIndexingLimit && (ut)->chunkContents[__offset]<0xdc00) { \ (ut)->chunkOffset=(int32_t)__offset; \ } else { \ utext_setNativeIndex((ut), (ix)); \ } \ } UPRV_BLOCK_MACRO_END /************************************************************************************ * * Functions related to writing or modifying the text. * These will work only with modifiable UTexts. Attempting to * modify a read-only UText will return an error status. * ************************************************************************************/ /** * Return true if the text can be written (modified) with utext_replace() or * utext_copy(). For the text to be writable, the text provider must * be of a type that supports writing and the UText must not be frozen. * * Attempting to modify text when utext_isWritable() is false will fail - * the text will not be modified, and an error will be returned from the function * that attempted the modification. * * @param ut the UText to be tested. * @return true if the text is modifiable. * * @see utext_freeze() * @see utext_replace() * @see utext_copy() * @stable ICU 3.4 * */ U_CAPI UBool U_EXPORT2 utext_isWritable(const UText *ut); /** * Test whether there is meta data associated with the text. * @see Replaceable::hasMetaData() * * @param ut The UText to be tested * @return true if the underlying text includes meta data. * @stable ICU 3.4 */ U_CAPI UBool U_EXPORT2 utext_hasMetaData(const UText *ut); /** * Replace a range of the original text with a replacement text. * * Leaves the current iteration position at the position following the * newly inserted replacement text. * * This function is only available on UText types that support writing, * that is, ones where utext_isWritable() returns true.
* * When using this function, there should be only a single UText opened onto the * underlying native text string. Behavior after a replace operation * on a UText is undefined for any other additional UTexts that refer to the * modified string. * * @param ut the UText representing the text to be operated on. * @param nativeStart the native index of the start of the region to be replaced * @param nativeLimit the native index of the character following the region to be replaced. * @param replacementText pointer to the replacement text * @param replacementLength length of the replacement text, or -1 if the text is NUL terminated. * @param status receives any error status. Possible errors include * U_NO_WRITE_PERMISSION * * @return The signed number of (native) storage units by which * the length of the text expanded or contracted. * * @stable ICU 3.4 */ U_CAPI int32_t U_EXPORT2 utext_replace(UText *ut, int64_t nativeStart, int64_t nativeLimit, const UChar *replacementText, int32_t replacementLength, UErrorCode *status); /** * * Copy or move a substring from one position to another within the text, * while retaining any metadata associated with the text. * This function is used to duplicate or reorder substrings. * The destination index must not overlap the source range. * * The text to be copied or moved is inserted at destIndex; * it does not replace or overwrite any existing text. * * The iteration position is left following the newly inserted text * at the destination position. * * This function is only available on UText types that support writing, * that is, ones where utext_isWritable() returns true. * * When using this function, there should be only a single UText opened onto the * underlying native text string. Behavior after a copy operation * on a UText is undefined in any other additional UTexts that refer to the * modified string. * * @param ut The UText representing the text to be operated on. 
* @param nativeStart The native index of the start of the region to be copied or moved * @param nativeLimit The native index of the character position following the region * to be copied. * @param destIndex The native destination index to which the source substring is * copied or moved. * @param move If true, then the substring is moved, not copied/duplicated. * @param status receives any error status. Possible errors include U_NO_WRITE_PERMISSION * * @stable ICU 3.4 */ U_CAPI void U_EXPORT2 utext_copy(UText *ut, int64_t nativeStart, int64_t nativeLimit, int64_t destIndex, UBool move, UErrorCode *status); /** *

* Freeze a UText. This prevents any modification to the underlying text itself * by means of functions operating on this UText. *

*

* Once frozen, a UText can not be unfrozen. The intent is to ensure * that the text underlying a frozen UText wrapper cannot be modified via that UText. *

*

* Caution: freezing a UText will disable changes made via the specific * frozen UText wrapper only; it will not have any effect on the ability to * directly modify the text by bypassing the UText. Any such backdoor modifications * are always an error while UText access is occurring because the underlying * text can get out of sync with UText's buffering. *

* * @param ut The UText to be frozen. * @see utext_isWritable() * @stable ICU 3.6 */ U_CAPI void U_EXPORT2 utext_freeze(UText *ut); /** * UText provider properties (bit field indexes). * * @see UText * @stable ICU 3.4 */ enum { /** * It is potentially time consuming for the provider to determine the length of the text. * @stable ICU 3.4 */ UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE = 1, /** * Text chunks remain valid and usable until the text object is modified or * deleted, not just until the next time the access() function is called * (which is the default). * @stable ICU 3.4 */ UTEXT_PROVIDER_STABLE_CHUNKS = 2, /** * The provider supports modifying the text via the replace() and copy() * functions. * @see Replaceable * @stable ICU 3.4 */ UTEXT_PROVIDER_WRITABLE = 3, /** * There is meta data associated with the text. * @see Replaceable::hasMetaData() * @stable ICU 3.4 */ UTEXT_PROVIDER_HAS_META_DATA = 4, /** * Text provider owns the text storage. * Generally occurs as the result of a deep clone of the UText. * When closing the UText, the associated text must * also be closed/deleted/freed/ whatever is appropriate. * @stable ICU 3.6 */ UTEXT_PROVIDER_OWNS_TEXT = 5 }; /** * Function type declaration for UText.clone(). * * Clone a UText. Much like opening a UText where the source text is itself * another UText. * * A deep clone will copy both the UText data structures and the underlying text. * The original and cloned UText will operate completely independently; modifications * made to the text in one will not affect the other. Text providers are not * required to support deep clones. The user of clone() must check the status return * and be prepared to handle failures. * * A shallow clone replicates only the UText data structures; it does not make * a copy of the underlying text. Shallow clones can be used as an efficient way to * have multiple iterators active in a single text string that is not being * modified.
* * A shallow clone operation must not fail except for truly exceptional conditions such * as memory allocation failures. * * A UText and its clone may be safely concurrently accessed by separate threads. * This is true for both shallow and deep clones. * It is the responsibility of the Text Provider to ensure that this thread safety * constraint is met. * * @param dest A UText struct to be filled in with the result of the clone operation, * or NULL if the clone function should heap-allocate a new UText struct. * @param src The UText to be cloned. * @param deep true to request a deep clone, false for a shallow clone. * @param status Errors are returned here. For deep clones, U_UNSUPPORTED_ERROR * should be returned if the text provider is unable to clone the * original text. * @return The newly created clone, or NULL if the clone operation failed. * * @stable ICU 3.4 */ typedef UText * U_CALLCONV UTextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status); /** * Function type declaration for UText.nativeLength(). * * @param ut the UText to get the length of. * @return the length, in the native units of the original text string. * @see UText * @stable ICU 3.4 */ typedef int64_t U_CALLCONV UTextNativeLength(UText *ut); /** * Function type declaration for UText.access(). Get the description of the text chunk * containing the text at a requested native index. The UText's iteration * position will be left at the requested index. If the index is out * of bounds, the iteration position will be left at the start or end * of the string, as appropriate. * * Chunks must begin and end on code point boundaries. A single code point * comprised of multiple storage units must never span a chunk boundary. * * * @param ut the UText being accessed. * @param nativeIndex Requested index of the text to be accessed. 
* @param forward If true, then the returned chunk must contain text * starting from the index, so that start<=index * The size (number of 16 bit UChars) in the data to be extracted is returned. The * full amount is returned, even when the specified buffer size is smaller. *

* The extracted string will (if you are a user) / must (if you are a text provider) * be NUL-terminated if there is sufficient space in the destination buffer. * * @param ut the UText from which to extract data. * @param nativeStart the native index of the first character to extract. * @param nativeLimit the native string index of the position following the last * character to extract. * @param dest the UChar (UTF-16) buffer into which the extracted text is placed * @param destCapacity The size, in UChars, of the destination buffer. May be zero * for precomputing the required size. * @param status receives any error status. * If U_BUFFER_OVERFLOW_ERROR: Returns number of UChars for * preflighting. * @return Number of UChars in the data. Does not include a trailing NUL. * * @stable ICU 3.4 */ typedef int32_t U_CALLCONV UTextExtract(UText *ut, int64_t nativeStart, int64_t nativeLimit, UChar *dest, int32_t destCapacity, UErrorCode *status); /** * Function type declaration for UText.replace(). * * Replace a range of the original text with a replacement text. * * Leaves the current iteration position at the position following the * newly inserted replacement text. * * This function need only be implemented on UText types that support writing. * * When using this function, there should be only a single UText opened onto the * underlying native text string. The function is responsible for updating the * text chunk within the UText to reflect the updated iteration position, * taking into account any changes to the underlying string's structure caused * by the replace operation. * * @param ut the UText representing the text to be operated on. * @param nativeStart the index of the start of the region to be replaced * @param nativeLimit the index of the character following the region to be replaced. * @param replacementText pointer to the replacement text * @param replacmentLength length of the replacement text in UChars, or -1 if the text is NUL terminated. 
* @param status           receives any error status.  Possible errors include
 *                         U_NO_WRITE_PERMISSION
 *
 * @return The signed number of (native) storage units by which
 *         the length of the text expanded or contracted.
 *
 * @stable ICU 3.4
 */
typedef int32_t U_CALLCONV
UTextReplace(UText *ut,
             int64_t nativeStart, int64_t nativeLimit,
             const UChar *replacementText, int32_t replacmentLength,
             UErrorCode *status);

/**
 * Function type declaration for UText.copy().
 *
 * Copy or move a substring from one position to another within the text,
 * while retaining any metadata associated with the text.
 * This function is used to duplicate or reorder substrings.
 * The destination index must not overlap the source range.
 *
 * The text to be copied or moved is inserted at destIndex;
 * it does not replace or overwrite any existing text.
 *
 * This function need only be implemented for UText types that support writing.
 *
 * When using this function, there should be only a single UText opened onto the
 * underlying native text string.  The function is responsible for updating the
 * text chunk within the UText to reflect the updated iteration position,
 * taking into account any changes to the underlying string's structure caused
 * by the copy or move operation.
 *
 * @param ut           The UText representing the text to be operated on.
 * @param nativeStart  The index of the start of the region to be copied or moved
 * @param nativeLimit  The index of the character following the region to be copied or moved.
 * @param nativeDest   The destination index to which the source substring is copied or moved.
 * @param move         If true, then the substring is moved, not copied/duplicated.
 * @param status       receives any error status.  Possible errors include U_NO_WRITE_PERMISSION
 *
 * @stable ICU 3.4
 */
typedef void U_CALLCONV
UTextCopy(UText *ut,
          int64_t nativeStart, int64_t nativeLimit,
          int64_t nativeDest,
          UBool move,
          UErrorCode *status);

/**
 * Function type declaration for UText.mapOffsetToNative().
 * Map from the current UChar offset within the current text chunk to
 * the corresponding native index in the original source text.
 *
 * This is required only for text providers that do not use native UTF-16 indexes.
 *
 * @param ut     the UText.
 * @return Absolute (native) index corresponding to chunkOffset in the current chunk.
 *         The returned native index should always be to a code point boundary.
 *
 * @stable ICU 3.4
 */
typedef int64_t U_CALLCONV
UTextMapOffsetToNative(const UText *ut);

/**
 * Function type declaration for UText.mapIndexToUTF16().
 * Map from a native index to a UChar offset within a text chunk.
 * Behavior is undefined if the native index does not fall within the
 * current chunk.
 *
 * This function is required only for text providers that do not use native UTF-16 indexes.
 *
 * @param ut          The UText containing the text chunk.
 * @param nativeIndex Absolute (native) text index, chunk->start<=index<=chunk->limit.
 * @return            Chunk-relative UTF-16 offset corresponding to the specified native
 *                    index.
 *
 * @stable ICU 3.4
 */
typedef int32_t U_CALLCONV
UTextMapNativeIndexToUTF16(const UText *ut, int64_t nativeIndex);


/**
 * Function type declaration for UText.close().
 *
 * A Text Provider close function is only required for provider types that make
 * allocations in their open function (or other functions) that must be
 * cleaned when the UText is closed.
 *
 * The allocation of the UText struct itself and any "extra" storage
 * associated with the UText is handled by the common UText implementation
 * and does not require provider specific cleanup in a close function.
 *
 * Most UText provider implementations do not need to implement this function.
 *
 * @param ut A UText object to be closed.
 *
 * @stable ICU 3.4
 */
typedef void U_CALLCONV
UTextClose(UText *ut);


/**
 * (public)  Function dispatch table for UText.
 *           Conceptually very much like a C++ Virtual Function Table.
 *           This struct defines the organization of the table.
*           Each text provider implementation must provide an
 *           actual table that is initialized with the appropriate functions
 *           for the type of text being handled.
 * @stable ICU 3.6
 */
struct UTextFuncs {
    /**
     * (public)  Function table size, sizeof(UTextFuncs)
     *           Intended for use should the table grow to accommodate added
     *           functions in the future, to allow tests for older format
     *           function tables that do not contain the extensions.
     *
     *           Fields are placed for optimal alignment on
     *           32/64/128-bit-pointer machines, by normally grouping together
     *           4 32-bit fields,
     *           4 pointers,
     *           2 64-bit fields
     *           in sequence.
     * @stable ICU 3.6
     */
    int32_t       tableSize;

    /**
      * (private)  Alignment padding.
      *            Do not use, reserved for use by the UText framework only.
      * @internal
      */
    int32_t       reserved1, /** @internal */ reserved2, /** @internal */ reserved3;


    /**
     * (public) Function pointer for UTextClone
     *
     * @see UTextClone
     * @stable ICU 3.6
     */
    UTextClone *clone;

    /**
     * (public) Function pointer for UTextNativeLength.
     * May be expensive to compute!
     *
     * @see UTextNativeLength
     * @stable ICU 3.6
     */
    UTextNativeLength *nativeLength;

    /**
     * (public) Function pointer for UTextAccess.
     *
     * @see UTextAccess
     * @stable ICU 3.6
     */
    UTextAccess *access;

    /**
     * (public) Function pointer for UTextExtract.
     *
     * @see UTextExtract
     * @stable ICU 3.6
     */
    UTextExtract *extract;

    /**
     * (public) Function pointer for UTextReplace.
     *
     * @see UTextReplace
     * @stable ICU 3.6
     */
    UTextReplace *replace;

    /**
     * (public) Function pointer for UTextCopy.
     *
     * @see UTextCopy
     * @stable ICU 3.6
     */
    UTextCopy *copy;

    /**
     * (public) Function pointer for UTextMapOffsetToNative.
     *
     * @see UTextMapOffsetToNative
     * @stable ICU 3.6
     */
    UTextMapOffsetToNative *mapOffsetToNative;

    /**
     * (public) Function pointer for UTextMapNativeIndexToUTF16.
     *
     * @see UTextMapNativeIndexToUTF16
     * @stable ICU 3.6
     */
    UTextMapNativeIndexToUTF16 *mapNativeIndexToUTF16;

    /**
     * (public) Function pointer for UTextClose.
     *
     * @see UTextClose
     * @stable ICU 3.6
     */
    UTextClose *close;

    /**
      * (private)  Spare function pointer
      * @internal
      */
    UTextClose *spare1;

    /**
      * (private)  Spare function pointer
      * @internal
      */
    UTextClose *spare2;

    /**
      * (private)  Spare function pointer
      * @internal
      */
    UTextClose *spare3;

};
/**
 * Function dispatch table for UText
 * @see UTextFuncs
 */
typedef struct UTextFuncs UTextFuncs;

/**
  *   UText struct.  Provides the interface between the generic UText access code
  *                  and the UText provider code that works on specific kinds of
  *                  text  (UTF-8, noncontiguous UTF-16, whatever.)
  *
  *                  Applications that are using predefined types of text providers
  *                  to pass text data to ICU services will have no need to view the
  *                  internals of the UText structs that they open.
  *
  *                  The field order below is the same order in which
  *                  UTEXT_INITIALIZER lists its values, so the two must be
  *                  kept in step.
  *
  * @stable ICU 3.6
  */
struct UText {
    /**
     *     (private)  Magic.  Used to help detect when UText functions are handed
     *                        invalid or uninitialized UText structs.
     *                        utext_openXYZ() functions take an initialized,
     *                        but not necessarily open, UText struct as an
     *                        optional fill-in parameter.  This magic field
     *                        is used to check for that initialization.
     *                        Text provider close functions must NOT clear
     *                        the magic field because that would prevent
     *                        reuse of the UText struct.
     * @internal
     */
    uint32_t       magic;


    /**
     *     (private)  Flags for managing the allocation and freeing of
     *                memory associated with this UText.
     * @internal
     */
    int32_t        flags;


    /**
      *  Text provider properties.  This set of flags is maintained by the
      *                             text provider implementation.
      *  @stable ICU 3.4
      */
    int32_t         providerProperties;

    /**
     * (public) sizeOfStruct=sizeof(UText)
     * Allows possible backward compatible extension.
     *
     * @stable ICU 3.4
     */
    int32_t         sizeOfStruct;

    /* ------ 16 byte alignment boundary -----------  */


    /**
      *  (protected) Native index of the first character position following
      *              the current chunk.
      *  @stable ICU 3.6
      */
    int64_t         chunkNativeLimit;

    /**
     *   (protected)  Size in bytes of the extra space (pExtra).
     *  @stable ICU 3.4
     */
    int32_t        extraSize;

    /**
      *    (protected) The highest chunk offset where native indexing and
      *    chunk (UTF-16) indexing correspond.  For UTF-16 sources, value
      *    will be equal to chunkLength.
      *
      *    @stable ICU 3.6
      */
    int32_t         nativeIndexingLimit;

    /* ---- 16 byte alignment boundary------ */

    /**
     *  (protected) Native index of the first character in the text chunk.
     *  @stable ICU 3.6
     */
    int64_t         chunkNativeStart;

    /**
     *  (protected) Current iteration position within the text chunk (UTF-16 buffer).
     *  This is the index to the character that will be returned by utext_next32().
     *  @stable ICU 3.6
     */
    int32_t         chunkOffset;

    /**
     *  (protected) Length of the text chunk (UTF-16 buffer), in UChars.
     *  @stable ICU 3.6
     */
    int32_t         chunkLength;

    /* ---- 16  byte alignment boundary-- */


    /**
     *  (protected)  pointer to a chunk of text in UTF-16 format.
     *  May refer either to original storage of the source of the text, or
     *  if conversion was required, to a buffer owned by the UText.
     *  @stable ICU 3.6
     */
    const UChar    *chunkContents;

     /**
      * (public)     Pointer to Dispatch table for accessing functions for this UText.
      * @stable ICU 3.6
      */
    const UTextFuncs     *pFuncs;

    /**
     *  (protected)  Pointer to additional space requested by the
     *               text provider during the utext_open operation.
     * @stable ICU 3.4
     */
    void          *pExtra;

    /**
     * (protected)  Pointer to string or text-containing object or similar.
     * This is the source of the text that this UText is wrapping, in a format
     *  that is known to the text provider functions.
     * @stable ICU 3.4
     */
    const void   *context;

    /* --- 16 byte alignment boundary--- */

    /**
     * (protected)  Pointer fields available for use by the text provider.
     * Not used by UText common code.
     * @stable ICU 3.6
     */
    const void     *p;
    /**
     * (protected)  Pointer fields available for use by the text provider.
     * Not used by UText common code.
     * @stable ICU 3.6
     */
    const void     *q;
     /**
     * (protected)  Pointer fields available for use by the text provider.
     * Not used by UText common code.
     * @stable ICU 3.6
      */
    const void     *r;

    /**
      *  Private field reserved for future use by the UText framework
      *     itself.  This is not to be touched by the text providers.
      * @internal ICU 3.4
      */
    void           *privP;


    /* --- 16 byte alignment boundary--- */


    /**
      * (protected)  Integer field reserved for use by the text provider.
      * Not used by the UText framework, or by the client (user) of the UText.
      * @stable ICU 3.4
      */
    int64_t         a;

    /**
      * (protected)  Integer field reserved for use by the text provider.
      * Not used by the UText framework, or by the client (user) of the UText.
      * @stable ICU 3.4
      */
    int32_t         b;

    /**
      * (protected)  Integer field reserved for use by the text provider.
      * Not used by the UText framework, or by the client (user) of the UText.
      * @stable ICU 3.4
      */
    int32_t         c;

    /*  ---- 16 byte alignment boundary---- */


    /**
      *  Private field reserved for future use by the UText framework
      *     itself.  This is not to be touched by the text providers.
      * @internal ICU 3.4
      */
    int64_t         privA;
    /**
      *  Private field reserved for future use by the UText framework
      *     itself.  This is not to be touched by the text providers.
      * @internal ICU 3.4
      */
    int32_t         privB;
    /**
      *  Private field reserved for future use by the UText framework
      *     itself.  This is not to be touched by the text providers.
      * @internal ICU 3.4
      */
    int32_t         privC;
};


/**
 *  Common function for use by Text Provider implementations to allocate and/or initialize
 *  a new UText struct.  To be called in the implementation of utext_open() functions.
 *  If the supplied UText parameter is null, a new UText struct will be allocated on the heap.
 *  If the supplied UText is already open, the provider's close function will be called
 *  so that the struct can be reused by the open that is in progress.
 *
 * @param ut   pointer to a UText struct to be re-used, or null if a new UText
 *             should be allocated.
 * @param extraSpace The amount of additional space to be allocated as part
 *             of this UText, for use by types of providers that require
 *             additional storage.
* @param status Errors are returned here. * @return pointer to the UText, allocated if necessary, with extra space set up if requested. * @stable ICU 3.4 */ U_CAPI UText * U_EXPORT2 utext_setup(UText *ut, int32_t extraSpace, UErrorCode *status); // do not use #ifndef U_HIDE_INTERNAL_API around the following! /** * @internal * Value used to help identify correctly initialized UText structs. * Note: must be publicly visible so that UTEXT_INITIALIZER can access it. */ enum { UTEXT_MAGIC = 0x345ad82c }; /** * initializer to be used with local (stack) instances of a UText * struct. UText structs must be initialized before passing * them to one of the utext_open functions. * * @stable ICU 3.6 */ #define UTEXT_INITIALIZER { \ UTEXT_MAGIC, /* magic */ \ 0, /* flags */ \ 0, /* providerProps */ \ sizeof(UText), /* sizeOfStruct */ \ 0, /* chunkNativeLimit */ \ 0, /* extraSize */ \ 0, /* nativeIndexingLimit */ \ 0, /* chunkNativeStart */ \ 0, /* chunkOffset */ \ 0, /* chunkLength */ \ NULL, /* chunkContents */ \ NULL, /* pFuncs */ \ NULL, /* pExtra */ \ NULL, /* context */ \ NULL, NULL, NULL, /* p, q, r */ \ NULL, /* privP */ \ 0, 0, 0, /* a, b, c */ \ 0, 0, 0 /* privA,B,C, */ \ } U_CDECL_END #endif // uset.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * * Copyright (C) 2002-2014, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* * file name: uset.h * encoding: UTF-8 * tab size: 8 (not used) * indentation:4 * * created on: 2002mar07 * created by: Markus W. Scherer * * C version of UnicodeSet. */ /** * \file * \brief C API: Unicode Set * *

This is a C wrapper around the C++ UnicodeSet class.

*/ #ifndef __USET_H__ #define __USET_H__ #ifndef USET_DEFINED #ifndef U_IN_DOXYGEN #define USET_DEFINED #endif /** * USet is the C API type corresponding to C++ class UnicodeSet. * Use the uset_* API to manipulate. Create with * uset_open*, and destroy with uset_close. * @stable ICU 2.4 */ typedef struct USet USet; #endif /** * Bitmask values to be passed to uset_openPatternOptions() or * uset_applyPattern() taking an option parameter. * @stable ICU 2.4 */ enum { /** * Ignore white space within patterns unless quoted or escaped. * @stable ICU 2.4 */ USET_IGNORE_SPACE = 1, /** * Enable case insensitive matching. E.g., "[ab]" with this flag * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will * match all except 'a', 'A', 'b', and 'B'. This performs a full * closure over case mappings, e.g. U+017F for s. * * The resulting set is a superset of the input for the code points but * not for the strings. * It performs a case mapping closure of the code points and adds * full case folding strings for the code points, and reduces strings of * the original set to their full case folding equivalents. * * This is designed for case-insensitive matches, for example * in regular expressions. The full code point case closure allows checking of * an input character directly against the closure set. * Strings are matched by comparing the case-folded form from the closure * set with an incremental case folding of the string in question. * * The closure set will also contain single code points if the original * set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.). * This is not necessary (that is, redundant) for the above matching method * but results in the same closure sets regardless of whether the original * set contained the code point or a string. * * @stable ICU 2.4 */ USET_CASE_INSENSITIVE = 2, /** * Enable case insensitive matching. E.g., "[ab]" with this flag * will match 'a', 'A', 'b', and 'B'. 
"[^ab]" with this flag will * match all except 'a', 'A', 'b', and 'B'. This adds the lower-, * title-, and uppercase mappings as well as the case folding * of each existing element in the set. * @stable ICU 3.2 */ USET_ADD_CASE_MAPPINGS = 4 }; /** * Argument values for whether span() and similar functions continue while * the current character is contained vs. not contained in the set. * * The functionality is straightforward for sets with only single code points, * without strings (which is the common case): * - USET_SPAN_CONTAINED and USET_SPAN_SIMPLE work the same. * - USET_SPAN_CONTAINED and USET_SPAN_SIMPLE are inverses of USET_SPAN_NOT_CONTAINED. * - span() and spanBack() partition any string the same way when * alternating between span(USET_SPAN_NOT_CONTAINED) and * span(either "contained" condition). * - Using a complemented (inverted) set and the opposite span conditions * yields the same results. * * When a set contains multi-code point strings, then these statements may not * be true, depending on the strings in the set (for example, whether they * overlap with each other) and the string that is processed. * For a set with strings: * - The complement of the set contains the opposite set of code points, * but the same set of strings. * Therefore, complementing both the set and the span conditions * may yield different results. * - When starting spans at different positions in a string * (span(s, ...) vs. span(s+1, ...)) the ends of the spans may be different * because a set string may start before the later position. * - span(USET_SPAN_SIMPLE) may be shorter than * span(USET_SPAN_CONTAINED) because it will not recursively try * all possible paths. * For example, with a set which contains the three strings "xy", "xya" and "ax", * span("xyax", USET_SPAN_CONTAINED) will return 4 but * span("xyax", USET_SPAN_SIMPLE) will return 3. * span(USET_SPAN_SIMPLE) will never be longer than * span(USET_SPAN_CONTAINED). 
* - With either "contained" condition, span() and spanBack() may partition
 *   a string in different ways.
 *   For example, with a set which contains the two strings "ab" and "ba",
 *   and when processing the string "aba",
 *   span() will yield contained/not-contained boundaries of { 0, 2, 3 }
 *   while spanBack() will yield boundaries of { 0, 1, 3 }.
 *
 * Note: If it is important to get the same boundaries whether iterating forward
 * or backward through a string, then either only span() should be used and
 * the boundaries cached for backward operation, or an ICU BreakIterator
 * could be used.
 *
 * Note: Unpaired surrogates are treated like surrogate code points.
 * Similarly, set strings match only on code point boundaries,
 * never in the middle of a surrogate pair.
 * Illegal UTF-8 sequences are treated like U+FFFD.
 * When processing UTF-8 strings, malformed set strings
 * (strings with unpaired surrogates which cannot be converted to UTF-8)
 * are ignored.
 *
 * @stable ICU 3.8
 */
typedef enum USetSpanCondition {
    /**
     * Continues a span() while there is no set element at the current position.
     * Increments by one code point at a time.
     * Stops before the first set element (character or string).
     * (For code points only, this is like while contains(current)==false).
     *
     * When span() returns, the substring between where it started and the position
     * it returned consists only of characters that are not in the set,
     * and none of its strings overlap with the span.
     *
     * @stable ICU 3.8
     */
    USET_SPAN_NOT_CONTAINED = 0,
    /**
     * Spans the longest substring that is a concatenation of set elements (characters or strings).
     * (For characters only, this is like while contains(current)==true).
     *
     * When span() returns, the substring between where it started and the position
     * it returned consists only of set elements (characters or strings) that are in the set.
     *
     * If a set contains strings, then the span will be the longest substring for which there
     * exists at least one non-overlapping concatenation of set elements (characters or strings).
     * This is equivalent to a POSIX regular expression for (OR of each set element)*.
     * (Java/ICU/Perl regex stops at the first match of an OR.)
     *
     * @stable ICU 3.8
     */
    USET_SPAN_CONTAINED = 1,
    /**
     * Continues a span() while there is a set element at the current position.
     * Increments by the longest matching element at each position.
     * (For characters only, this is like while contains(current)==true).
     *
     * When span() returns, the substring between where it started and the position
     * it returned consists only of set elements (characters or strings) that are in the set.
     *
     * If a set only contains single characters, then this is the same
     * as USET_SPAN_CONTAINED.
     *
     * If a set contains strings, then the span will be the longest substring
     * with a match at each position with the longest single set element (character or string).
     *
     * Use this span condition together with other longest-match algorithms,
     * such as ICU converters (ucnv_getUnicodeSet()).
     *
     * @stable ICU 3.8
     */
    USET_SPAN_SIMPLE = 2,
} USetSpanCondition;

enum {
    /**
     * Capacity of USerializedSet::staticArray.
     * Enough for any single-code point set.
     * Also provides padding for nice sizeof(USerializedSet).
     * @stable ICU 2.4
     */
    USET_SERIALIZED_STATIC_ARRAY_CAPACITY=8
};

/**
 * A serialized form of a Unicode set.  Limited manipulations are
 * possible directly on a serialized set.  See below.
 * @stable ICU 2.4
 */
typedef struct USerializedSet {
    /**
     * The serialized Unicode Set.
     * @stable ICU 2.4
     */
    const uint16_t *array;
    /**
     * The length of the array that contains BMP characters.
     * @stable ICU 2.4
     */
    int32_t bmpLength;
    /**
     * The total length of the array.
     * @stable ICU 2.4
     */
    int32_t length;
    /**
     * A small buffer for the array to reduce memory allocations.
     * Holds USET_SERIALIZED_STATIC_ARRAY_CAPACITY 16-bit units.
     * @stable ICU 2.4
     */
    uint16_t staticArray[USET_SERIALIZED_STATIC_ARRAY_CAPACITY];
} USerializedSet;

/*********************************************************************
 * USet API
 *********************************************************************/

/**
 * Create an empty USet object.
 * Equivalent to uset_open(1, 0).
 * @return a newly created USet.  The caller must call uset_close() on
 * it when done.
 * @stable ICU 4.2
 */
U_CAPI USet* U_EXPORT2
uset_openEmpty(void);

/**
 * Creates a USet object that contains the range of characters
 * start..end, inclusive.  If start > end
 * then an empty set is created (same as using uset_openEmpty()).
 * @param start first character of the range, inclusive
 * @param end last character of the range, inclusive
 * @return a newly created USet.  The caller must call uset_close() on
 * it when done.
 * @stable ICU 2.4
 */
U_CAPI USet* U_EXPORT2
uset_open(UChar32 start, UChar32 end);

/**
 * Creates a set from the given pattern.  See the UnicodeSet class
 * description for the syntax of the pattern language.
 * @param pattern a string specifying what characters are in the set
 * @param patternLength the length of the pattern, or -1 if null
 * terminated
 * @param ec the error code
 * @stable ICU 2.4
 */
U_CAPI USet* U_EXPORT2
uset_openPattern(const UChar* pattern, int32_t patternLength,
                 UErrorCode* ec);

/**
 * Creates a set from the given pattern.  See the UnicodeSet class
 * description for the syntax of the pattern language.
 * @param pattern a string specifying what characters are in the set
 * @param patternLength the length of the pattern, or -1 if null
 * terminated
 * @param options bitmask for options to apply to the pattern.
 * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
 * @param ec the error code
 * @stable ICU 2.4
 */
U_CAPI USet* U_EXPORT2
uset_openPatternOptions(const UChar* pattern, int32_t patternLength,
                 uint32_t options,
                 UErrorCode* ec);

/**
 * Disposes of the storage used by a USet object.
This function should * be called exactly once for objects returned by uset_open(). * @param set the object to dispose of * @stable ICU 2.4 */ U_CAPI void U_EXPORT2 uset_close(USet* set); /** * Returns a copy of this object. * If this set is frozen, then the clone will be frozen as well. * Use uset_cloneAsThawed() for a mutable clone of a frozen set. * @param set the original set * @return the newly allocated copy of the set * @see uset_cloneAsThawed * @stable ICU 3.8 */ U_CAPI USet * U_EXPORT2 uset_clone(const USet *set); /** * Determines whether the set has been frozen (made immutable) or not. * See the ICU4J Freezable interface for details. * @param set the set * @return true/false for whether the set has been frozen * @see uset_freeze * @see uset_cloneAsThawed * @stable ICU 3.8 */ U_CAPI UBool U_EXPORT2 uset_isFrozen(const USet *set); /** * Freeze the set (make it immutable). * Once frozen, it cannot be unfrozen and is therefore thread-safe * until it is deleted. * See the ICU4J Freezable interface for details. * Freezing the set may also make some operations faster, for example * uset_contains() and uset_span(). * A frozen set will not be modified. (It remains frozen.) * @param set the set * @return the same set, now frozen * @see uset_isFrozen * @see uset_cloneAsThawed * @stable ICU 3.8 */ U_CAPI void U_EXPORT2 uset_freeze(USet *set); /** * Clone the set and make the clone mutable. * See the ICU4J Freezable interface for details. * @param set the set * @return the mutable clone * @see uset_freeze * @see uset_isFrozen * @see uset_clone * @stable ICU 3.8 */ U_CAPI USet * U_EXPORT2 uset_cloneAsThawed(const USet *set); /** * Causes the USet object to represent the range start - end. * If start > end then this USet is set to an empty range. * A frozen set will not be modified. 
* @param set the object to set to the given range * @param start first character in the set, inclusive * @param end last character in the set, inclusive * @stable ICU 3.2 */ U_CAPI void U_EXPORT2 uset_set(USet* set, UChar32 start, UChar32 end); /** * Modifies the set to represent the set specified by the given * pattern. See the UnicodeSet class description for the syntax of * the pattern language. See also the User Guide chapter about UnicodeSet. * Empties the set passed before applying the pattern. * A frozen set will not be modified. * @param set The set to which the pattern is to be applied. * @param pattern A pointer to UChar string specifying what characters are in the set. * The character at pattern[0] must be a '['. * @param patternLength The length of the UChar string. -1 if NUL terminated. * @param options A bitmask for options to apply to the pattern. * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE. * @param status Returns an error if the pattern cannot be parsed. * @return Upon successful parse, the value is either * the index of the character after the closing ']' * of the parsed pattern. * If the status code indicates failure, then the return value * is the index of the error in the source. * * @stable ICU 2.8 */ U_CAPI int32_t U_EXPORT2 uset_applyPattern(USet *set, const UChar *pattern, int32_t patternLength, uint32_t options, UErrorCode *status); /** * Modifies the set to contain those code points which have the given value * for the given binary or enumerated property, as returned by * u_getIntPropertyValue. Prior contents of this set are lost. * A frozen set will not be modified. * * @param set the object to contain the code points defined by the property * * @param prop a property in the range UCHAR_BIN_START..UCHAR_BIN_LIMIT-1 * or UCHAR_INT_START..UCHAR_INT_LIMIT-1 * or UCHAR_MASK_START..UCHAR_MASK_LIMIT-1. * * @param value a value in the range u_getIntPropertyMinValue(prop).. 
* u_getIntPropertyMaxValue(prop), with one exception. If prop is
 * UCHAR_GENERAL_CATEGORY_MASK, then value should not be a UCharCategory, but
 * rather a mask value produced by U_GET_GC_MASK(). This allows grouped
 * categories such as [:L:] to be represented.
 *
 * @param ec error code input/output parameter
 *
 * @stable ICU 3.2
 */
U_CAPI void U_EXPORT2
uset_applyIntPropertyValue(USet* set,
                           UProperty prop, int32_t value, UErrorCode* ec);

/**
 * Modifies the set to contain those code points which have the
 * given value for the given property. Prior contents of this
 * set are lost.
 * A frozen set will not be modified.
 *
 * @param set the object to contain the code points defined by the given
 * property and value alias
 *
 * @param prop a string specifying a property alias, either short or long.
 * The name is matched loosely. See PropertyAliases.txt for names and a
 * description of loose matching. If the value string is empty, then this
 * string is interpreted as either a General_Category value alias, a Script
 * value alias, a binary property alias, or a special ID. Special IDs are
 * matched loosely and correspond to the following sets:
 *
 * "ANY" = [\\u0000-\\U0010FFFF],
 * "ASCII" = [\\u0000-\\u007F],
 * "Assigned" = [:^Cn:].
 *
 * @param propLength the length of the prop, or -1 if NUL-terminated
 *
 * @param value a string specifying a value alias, either short or long.
 * The name is matched loosely. See PropertyValueAliases.txt for names
 * and a description of loose matching. In addition to aliases listed,
 * numeric values and canonical combining classes may be expressed
 * numerically, e.g., ("nv", "0.5") or ("ccc", "220"). The value string
 * may also be empty.
 *
 * @param valueLength the length of the value, or -1 if NUL-terminated
 *
 * @param ec error code input/output parameter
 *
 * @stable ICU 3.2
 */
U_CAPI void U_EXPORT2
uset_applyPropertyAlias(USet* set,
                        const UChar *prop, int32_t propLength,
                        const UChar *value, int32_t valueLength,
                        UErrorCode* ec);

/**
 * Return true if the given position, in the given pattern, appears
 * to be the start of a UnicodeSet pattern.
 *
 * @param pattern a string specifying the pattern
 * @param patternLength the length of the pattern, or -1 if NUL-terminated
 * @param pos the given position
 * @stable ICU 3.2
 */
U_CAPI UBool U_EXPORT2
uset_resemblesPattern(const UChar *pattern, int32_t patternLength,
                      int32_t pos);

/**
 * Returns a string representation of this set. If the result of
 * calling this function is passed to a uset_openPattern(), it
 * will produce another set that is equal to this one.
 * @param set the set
 * @param result the string to receive the rules, may be NULL
 * @param resultCapacity the capacity of result, may be 0 if result is NULL
 * @param escapeUnprintable if true then convert unprintable
 * characters to their hex escape representations, \\uxxxx or
 * \\Uxxxxxxxx. Unprintable characters are those other than
 * U+000A, U+0020..U+007E.
 * @param ec error code.
 * @return length of string, possibly larger than resultCapacity
 * @stable ICU 2.4
 */
U_CAPI int32_t U_EXPORT2
uset_toPattern(const USet* set,
               UChar* result, int32_t resultCapacity,
               UBool escapeUnprintable,
               UErrorCode* ec);

/**
 * Adds the given character to the given USet. After this call,
 * uset_contains(set, c) will return true.
 * A frozen set will not be modified.
 * @param set the object to which to add the character
 * @param c the character to add
 * @stable ICU 2.4
 */
U_CAPI void U_EXPORT2
uset_add(USet* set, UChar32 c);

/**
 * Adds all of the elements in the specified set to this set if
 * they're not already present. This operation effectively
 * modifies this set so that its value is the union of the two
 * sets.
The behavior of this operation is unspecified if the specified
 * collection is modified while the operation is in progress.
 * A frozen set will not be modified.
 *
 * @param set the object to which to add the set
 * @param additionalSet the source set whose elements are to be added to this set.
 * @stable ICU 2.6
 */
U_CAPI void U_EXPORT2
uset_addAll(USet* set, const USet *additionalSet);

/**
 * Adds the given range of characters to the given USet. After this call,
 * uset_containsRange(set, start, end) will return true.
 * A frozen set will not be modified.
 * @param set the object to which to add the range
 * @param start the first character of the range to add, inclusive
 * @param end the last character of the range to add, inclusive
 * @stable ICU 2.2
 */
U_CAPI void U_EXPORT2
uset_addRange(USet* set, UChar32 start, UChar32 end);

/**
 * Adds the given string to the given USet. After this call,
 * uset_containsString(set, str, strLen) will return true.
 * A frozen set will not be modified.
 * @param set the object to which to add the string
 * @param str the string to add
 * @param strLen the length of the string or -1 if null terminated.
 * @stable ICU 2.4
 */
U_CAPI void U_EXPORT2
uset_addString(USet* set, const UChar* str, int32_t strLen);

/**
 * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
 * If this set already contains any particular character, it has no effect on that character.
 * A frozen set will not be modified.
 * @param set the object to which to add the characters
 * @param str the source string
 * @param strLen the length of the string or -1 if null terminated.
 * @stable ICU 3.4
 */
U_CAPI void U_EXPORT2
uset_addAllCodePoints(USet* set, const UChar *str, int32_t strLen);

/**
 * Removes the given character from the given USet. After this call,
 * uset_contains(set, c) will return false.
 * A frozen set will not be modified.
* @param set the object from which to remove the character * @param c the character to remove * @stable ICU 2.4 */ U_CAPI void U_EXPORT2 uset_remove(USet* set, UChar32 c); /** * Removes the given range of characters from the given USet. After this call, * uset_contains(set, start, end) will return false. * A frozen set will not be modified. * @param set the object to which to add the character * @param start the first character of the range to remove, inclusive * @param end the last character of the range to remove, inclusive * @stable ICU 2.2 */ U_CAPI void U_EXPORT2 uset_removeRange(USet* set, UChar32 start, UChar32 end); /** * Removes the given string to the given USet. After this call, * uset_containsString(set, str, strLen) will return false. * A frozen set will not be modified. * @param set the object to which to add the character * @param str the string to remove * @param strLen the length of the string or -1 if null terminated. * @stable ICU 2.4 */ U_CAPI void U_EXPORT2 uset_removeString(USet* set, const UChar* str, int32_t strLen); #if (NTDDI_VERSION >= NTDDI_WIN11_ZN) /** * Removes EACH of the characters in this string. Note: "ch" == {"c", "h"} * A frozen set will not be modified. * * @param set the object to be modified * @param str the string * @param length the length of the string, or -1 if NUL-terminated * @stable ICU 69 */ U_CAPI void U_EXPORT2 uset_removeAllCodePoints(USet *set, const UChar *str, int32_t length); #endif // (NTDDI_VERSION >= NTDDI_WIN11_ZN) /** * Removes from this set all of its elements that are contained in the * specified set. This operation effectively modifies this * set so that its value is the asymmetric set difference of * the two sets. * A frozen set will not be modified. 
* @param set the object from which the elements are to be removed * @param removeSet the object that defines which elements will be * removed from this set * @stable ICU 3.2 */ U_CAPI void U_EXPORT2 uset_removeAll(USet* set, const USet* removeSet); /** * Retain only the elements in this set that are contained in the * specified range. If start > end then an empty range is * retained, leaving the set empty. This is equivalent to * a boolean logic AND, or a set INTERSECTION. * A frozen set will not be modified. * * @param set the object for which to retain only the specified range * @param start first character, inclusive, of range to be retained * to this set. * @param end last character, inclusive, of range to be retained * to this set. * @stable ICU 3.2 */ U_CAPI void U_EXPORT2 uset_retain(USet* set, UChar32 start, UChar32 end); #if (NTDDI_VERSION >= NTDDI_WIN11_ZN) /** * Retains only the specified string from this set if it is present. * Upon return this set will be empty if it did not contain s, or * will only contain s if it did contain s. * A frozen set will not be modified. * * @param set the object to be modified * @param str the string * @param length the length of the string, or -1 if NUL-terminated * @stable ICU 69 */ U_CAPI void U_EXPORT2 uset_retainString(USet *set, const UChar *str, int32_t length); /** * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"} * A frozen set will not be modified. * * @param set the object to be modified * @param str the string * @param length the length of the string, or -1 if NUL-terminated * @stable ICU 69 */ U_CAPI void U_EXPORT2 uset_retainAllCodePoints(USet *set, const UChar *str, int32_t length); #endif // (NTDDI_VERSION >= NTDDI_WIN11_ZN) /** * Retains only the elements in this set that are contained in the * specified set. In other words, removes from this set all of * its elements that are not contained in the specified set. 
This * operation effectively modifies this set so that its value is * the intersection of the two sets. * A frozen set will not be modified. * * @param set the object on which to perform the retain * @param retain set that defines which elements this set will retain * @stable ICU 3.2 */ U_CAPI void U_EXPORT2 uset_retainAll(USet* set, const USet* retain); /** * Reallocate this objects internal structures to take up the least * possible space, without changing this object's value. * A frozen set will not be modified. * * @param set the object on which to perform the compact * @stable ICU 3.2 */ U_CAPI void U_EXPORT2 uset_compact(USet* set); /** * Inverts this set. This operation modifies this set so that * its value is its complement. This operation does not affect * the multicharacter strings, if any. * A frozen set will not be modified. * @param set the set * @stable ICU 2.4 */ U_CAPI void U_EXPORT2 uset_complement(USet* set); #if (NTDDI_VERSION >= NTDDI_WIN11_ZN) /** * Complements the specified range in this set. Any character in * the range will be removed if it is in this set, or will be * added if it is not in this set. If start > end * then an empty range is complemented, leaving the set unchanged. * This is equivalent to a boolean logic XOR. * A frozen set will not be modified. * * @param set the object to be modified * @param start first character, inclusive, of range * @param end last character, inclusive, of range * @stable ICU 69 */ U_CAPI void U_EXPORT2 uset_complementRange(USet *set, UChar32 start, UChar32 end); /** * Complements the specified string in this set. * The string will be removed if it is in this set, or will be added if it is not in this set. * A frozen set will not be modified. 
* * @param set the object to be modified * @param str the string * @param length the length of the string, or -1 if NUL-terminated * @stable ICU 69 */ U_CAPI void U_EXPORT2 uset_complementString(USet *set, const UChar *str, int32_t length); /** * Complements EACH of the characters in this string. Note: "ch" == {"c", "h"} * A frozen set will not be modified. * * @param set the object to be modified * @param str the string * @param length the length of the string, or -1 if NUL-terminated * @stable ICU 69 */ U_CAPI void U_EXPORT2 uset_complementAllCodePoints(USet *set, const UChar *str, int32_t length); #endif // (NTDDI_VERSION >= NTDDI_WIN11_ZN) /** * Complements in this set all elements contained in the specified * set. Any character in the other set will be removed if it is * in this set, or will be added if it is not in this set. * A frozen set will not be modified. * * @param set the set with which to complement * @param complement set that defines which elements will be xor'ed * from this set. * @stable ICU 3.2 */ U_CAPI void U_EXPORT2 uset_complementAll(USet* set, const USet* complement); /** * Removes all of the elements from this set. This set will be * empty after this call returns. * A frozen set will not be modified. * @param set the set * @stable ICU 2.4 */ U_CAPI void U_EXPORT2 uset_clear(USet* set); /** * Close this set over the given attribute. For the attribute * USET_CASE, the result is to modify this set so that: * * 1. For each character or string 'a' in this set, all strings or * characters 'b' such that foldCase(a) == foldCase(b) are added * to this set. * * 2. For each string 'e' in the resulting set, if e != * foldCase(e), 'e' will be removed. * * Example: [aq\\u00DF{Bc}{bC}{Fi}] => [aAqQ\\u00DF\\uFB01{ss}{bc}{fi}] * * (Here foldCase(x) refers to the operation u_strFoldCase, and a * == b denotes that the contents are the same, not pointer * comparison.) * * A frozen set will not be modified. 
* * @param set the set * * @param attributes bitmask for attributes to close over. * Currently only the USET_CASE bit is supported. Any undefined bits * are ignored. * @stable ICU 4.2 */ U_CAPI void U_EXPORT2 uset_closeOver(USet* set, int32_t attributes); /** * Remove all strings from this set. * * @param set the set * @stable ICU 4.2 */ U_CAPI void U_EXPORT2 uset_removeAllStrings(USet* set); /** * Returns true if the given USet contains no characters and no * strings. * @param set the set * @return true if set is empty * @stable ICU 2.4 */ U_CAPI UBool U_EXPORT2 uset_isEmpty(const USet* set); #if (NTDDI_VERSION >= NTDDI_WIN11_ZN) /** * @param set the set * @return true if this set contains multi-character strings or the empty string. * @stable ICU 70 */ U_CAPI UBool U_EXPORT2 uset_hasStrings(const USet *set); #endif // (NTDDI_VERSION >= NTDDI_WIN11_ZN) /** * Returns true if the given USet contains the given character. * This function works faster with a frozen set. * @param set the set * @param c The codepoint to check for within the set * @return true if set contains c * @stable ICU 2.4 */ U_CAPI UBool U_EXPORT2 uset_contains(const USet* set, UChar32 c); /** * Returns true if the given USet contains all characters c * where start <= c && c <= end. * @param set the set * @param start the first character of the range to test, inclusive * @param end the last character of the range to test, inclusive * @return true if set contains the range * @stable ICU 2.2 */ U_CAPI UBool U_EXPORT2 uset_containsRange(const USet* set, UChar32 start, UChar32 end); /** * Returns true if the given USet contains the given string. * @param set the set * @param str the string * @param strLen the length of the string or -1 if null terminated. 
* @return true if set contains str * @stable ICU 2.4 */ U_CAPI UBool U_EXPORT2 uset_containsString(const USet* set, const UChar* str, int32_t strLen); /** * Returns the index of the given character within this set, where * the set is ordered by ascending code point. If the character * is not in this set, return -1. The inverse of this method is * charAt(). * @param set the set * @param c the character to obtain the index for * @return an index from 0..size()-1, or -1 * @stable ICU 3.2 */ U_CAPI int32_t U_EXPORT2 uset_indexOf(const USet* set, UChar32 c); /** * Returns the character at the given index within this set, where * the set is ordered by ascending code point. If the index is * out of range, return (UChar32)-1. The inverse of this method is * indexOf(). * @param set the set * @param charIndex an index from 0..size()-1 to obtain the char for * @return the character at the given index, or (UChar32)-1. * @stable ICU 3.2 */ U_CAPI UChar32 U_EXPORT2 uset_charAt(const USet* set, int32_t charIndex); /** * Returns the number of characters and strings contained in the given * USet. * @param set the set * @return a non-negative integer counting the characters and strings * contained in set * @stable ICU 2.4 * @see uset_getRangeCount */ U_CAPI int32_t U_EXPORT2 uset_size(const USet* set); #if (NTDDI_VERSION >= NTDDI_WIN11_ZN) /** * @param set the set * @return the number of ranges in this set. * @stable ICU 70 * @see uset_getItemCount * @see uset_getItem * @see uset_size */ U_CAPI int32_t U_EXPORT2 uset_getRangeCount(const USet *set); #endif // (NTDDI_VERSION >= NTDDI_WIN11_ZN) /** * Returns the number of items in this set. An item is either a range * of characters or a single multicharacter string. * @param set the set * @return a non-negative integer counting the character ranges * and/or strings contained in set * @stable ICU 2.4 */ U_CAPI int32_t U_EXPORT2 uset_getItemCount(const USet* set); /** * Returns an item of this set. 
An item is either a range of
 * characters or a single multicharacter string.
 * @param set the set
 * @param itemIndex a non-negative integer in the range 0..
 * uset_getItemCount(set)-1
 * @param start pointer to variable to receive first character
 * in range, inclusive
 * @param end pointer to variable to receive last character in range,
 * inclusive
 * @param str buffer to receive the string, may be NULL
 * @param strCapacity capacity of str, or 0 if str is NULL
 * @param ec error code; U_INDEX_OUTOFBOUNDS_ERROR if the itemIndex is out of range
 * @return the length of the string (0 or >= 2), or 0 if the item is a range,
 * or -1 if the itemIndex is out of range
 * @stable ICU 2.4
 */
U_CAPI int32_t U_EXPORT2
uset_getItem(const USet* set, int32_t itemIndex,
             UChar32* start, UChar32* end,
             UChar* str, int32_t strCapacity,
             UErrorCode* ec);

/**
 * Returns true if set1 contains all the characters and strings
 * of set2. It answers the question, 'Is set1 a superset of set2?'
 * @param set1 set to be checked for containment
 * @param set2 set to be checked for containment
 * @return true if the test condition is met
 * @stable ICU 3.2
 */
U_CAPI UBool U_EXPORT2
uset_containsAll(const USet* set1, const USet* set2);

/**
 * Returns true if this set contains all the characters
 * of the given string. This does not check containment of grapheme
 * clusters, like uset_containsString.
 * @param set set of characters to be checked for containment
 * @param str string containing codepoints to be checked for containment
 * @param strLen the length of the string or -1 if null terminated.
 * @return true if the test condition is met
 * @stable ICU 3.4
 */
U_CAPI UBool U_EXPORT2
uset_containsAllCodePoints(const USet* set, const UChar *str, int32_t strLen);

/**
 * Returns true if set1 contains none of the characters and strings
 * of set2. It answers the question, 'Is set1 a disjoint set of set2?'
* @param set1 set to be checked for containment
 * @param set2 set to be checked for containment
 * @return true if the test condition is met
 * @stable ICU 3.2
 */
U_CAPI UBool U_EXPORT2
uset_containsNone(const USet* set1, const USet* set2);

/**
 * Returns true if set1 contains some of the characters and strings
 * of set2. It answers the question, 'Do set1 and set2 have an intersection?'
 * @param set1 set to be checked for containment
 * @param set2 set to be checked for containment
 * @return true if the test condition is met
 * @stable ICU 3.2
 */
U_CAPI UBool U_EXPORT2
uset_containsSome(const USet* set1, const USet* set2);

/**
 * Returns the length of the initial substring of the input string which
 * consists only of characters and strings that are contained in this set
 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
 * or only of characters and strings that are not contained
 * in this set (USET_SPAN_NOT_CONTAINED).
 * See USetSpanCondition for details.
 * Similar to the strspn() C library function.
 * Unpaired surrogates are treated according to contains() of their surrogate code points.
 * This function works faster with a frozen set and with a non-negative string length argument.
 * @param set the set
 * @param s start of the string
 * @param length of the string; can be -1 for NUL-terminated
 * @param spanCondition specifies the containment condition
 * @return the length of the initial substring according to the spanCondition;
 * 0 if the start of the string does not fit the spanCondition
 * @stable ICU 3.8
 * @see USetSpanCondition
 */
U_CAPI int32_t U_EXPORT2
uset_span(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition);

/**
 * Returns the start of the trailing substring of the input string which
 * consists only of characters and strings that are contained in this set
 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
 * or only of characters and strings that are not contained
 * in this set (USET_SPAN_NOT_CONTAINED).
 * See USetSpanCondition for details.
* Unpaired surrogates are treated according to contains() of their surrogate code points.
 * This function works faster with a frozen set and with a non-negative string length argument.
 * @param set the set
 * @param s start of the string
 * @param length of the string; can be -1 for NUL-terminated
 * @param spanCondition specifies the containment condition
 * @return the start of the trailing substring according to the spanCondition;
 * the string length if the end of the string does not fit the spanCondition
 * @stable ICU 3.8
 * @see USetSpanCondition
 */
U_CAPI int32_t U_EXPORT2
uset_spanBack(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition);

/**
 * Returns the length of the initial substring of the input string which
 * consists only of characters and strings that are contained in this set
 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
 * or only of characters and strings that are not contained
 * in this set (USET_SPAN_NOT_CONTAINED).
 * See USetSpanCondition for details.
 * Similar to the strspn() C library function.
 * Malformed byte sequences are treated according to contains(0xfffd).
 * This function works faster with a frozen set and with a non-negative string length argument.
 * @param set the set
 * @param s start of the string (UTF-8)
 * @param length of the string; can be -1 for NUL-terminated
 * @param spanCondition specifies the containment condition
 * @return the length of the initial substring according to the spanCondition;
 * 0 if the start of the string does not fit the spanCondition
 * @stable ICU 3.8
 * @see USetSpanCondition
 */
U_CAPI int32_t U_EXPORT2
uset_spanUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition);

/**
 * Returns the start of the trailing substring of the input string which
 * consists only of characters and strings that are contained in this set
 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
 * or only of characters and strings that are not contained
 * in this set (USET_SPAN_NOT_CONTAINED).
* See USetSpanCondition for details.
 * Malformed byte sequences are treated according to contains(0xfffd).
 * This function works faster with a frozen set and with a non-negative string length argument.
 * @param set the set
 * @param s start of the string (UTF-8)
 * @param length of the string; can be -1 for NUL-terminated
 * @param spanCondition specifies the containment condition
 * @return the start of the trailing substring according to the spanCondition;
 * the string length if the end of the string does not fit the spanCondition
 * @stable ICU 3.8
 * @see USetSpanCondition
 */
U_CAPI int32_t U_EXPORT2
uset_spanBackUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition);

/**
 * Returns true if set1 contains all of the characters and strings
 * of set2, and vice versa. It answers the question, 'Is set1 equal to set2?'
 * @param set1 set to be checked for containment
 * @param set2 set to be checked for containment
 * @return true if the test condition is met
 * @stable ICU 3.2
 */
U_CAPI UBool U_EXPORT2
uset_equals(const USet* set1, const USet* set2);

/*********************************************************************
 * Serialized set API
 *********************************************************************/

/**
 * Serializes this set into an array of 16-bit integers. Serialization
 * (currently) only records the characters in the set; multicharacter
 * strings are ignored.
 *
 * The array
 * has the following format (each line is one 16-bit integer):
 *
 *  length     = (n+2*m) | (m!=0?0x8000:0)
 *  bmpLength  = n; present if m!=0
 *  bmp[0]
 *  bmp[1]
 *  ...
 *  bmp[n-1]
 *  supp-high[0]
 *  supp-low[0]
 *  supp-high[1]
 *  supp-low[1]
 *  ...
 *  supp-high[m-1]
 *  supp-low[m-1]
 *
 * The array starts with a header. After the header are n bmp
 * code points, then m supplementary code points. Either n or m
 * or both may be zero. n+2*m is always <= 0x7FFF.
 *
 * If there are no supplementary characters (if m==0) then the
 * header is one 16-bit integer, 'length', with value n.
 *
 * If there are supplementary characters (if m!=0) then the header
 * is two 16-bit integers. The first, 'length', has value
 * (n+2*m)|0x8000. The second, 'bmpLength', has value n.
 *
 * After the header the code points are stored in ascending order.
 * Supplementary code points are stored as most significant 16
 * bits followed by least significant 16 bits.
 *
 * @param set the set
 * @param dest pointer to buffer of destCapacity 16-bit integers.
 * May be NULL only if destCapacity is zero.
 * @param destCapacity size of dest, or zero. Must not be negative.
 * @param pErrorCode pointer to the error code. Will be set to
 * U_INDEX_OUTOFBOUNDS_ERROR if n+2*m > 0x7FFF. Will be set to
 * U_BUFFER_OVERFLOW_ERROR if n+2*m+(m!=0?2:1) > destCapacity.
 * @return the total length of the serialized format, including
 * the header, that is, n+2*m+(m!=0?2:1), or 0 on error other
 * than U_BUFFER_OVERFLOW_ERROR.
 * @stable ICU 2.4
 */
U_CAPI int32_t U_EXPORT2
uset_serialize(const USet* set, uint16_t* dest, int32_t destCapacity, UErrorCode* pErrorCode);

/**
 * Given a serialized array, fill in the given serialized set object.
 * @param fillSet pointer to the USerializedSet to fill in
 * @param src pointer to start of array
 * @param srcLength length of array
 * @return true if the given array is valid, otherwise false
 * @stable ICU 2.4
 */
U_CAPI UBool U_EXPORT2
uset_getSerializedSet(USerializedSet* fillSet, const uint16_t* src, int32_t srcLength);

/**
 * Set the USerializedSet to contain the given character (and nothing
 * else).
 * @param fillSet pointer to the USerializedSet to fill in
 * @param c The codepoint to set
 * @stable ICU 2.4
 */
U_CAPI void U_EXPORT2
uset_setSerializedToOne(USerializedSet* fillSet, UChar32 c);

/**
 * Returns true if the given USerializedSet contains the given
 * character.
* @param set the serialized set * @param c The codepoint to check for within the set * @return true if set contains c * @stable ICU 2.4 */ U_CAPI UBool U_EXPORT2 uset_serializedContains(const USerializedSet* set, UChar32 c); /** * Returns the number of disjoint ranges of characters contained in * the given serialized set. Ignores any strings contained in the * set. * @param set the serialized set * @return a non-negative integer counting the character ranges * contained in set * @stable ICU 2.4 */ U_CAPI int32_t U_EXPORT2 uset_getSerializedRangeCount(const USerializedSet* set); /** * Returns a range of characters contained in the given serialized * set. * @param set the serialized set * @param rangeIndex a non-negative integer in the range 0.. * uset_getSerializedRangeCount(set)-1 * @param pStart pointer to variable to receive first character * in range, inclusive * @param pEnd pointer to variable to receive last character in range, * inclusive * @return true if rangeIndex is valid, otherwise false * @stable ICU 2.4 */ U_CAPI UBool U_EXPORT2 uset_getSerializedRange(const USerializedSet* set, int32_t rangeIndex, UChar32* pStart, UChar32* pEnd); #endif // unorm2.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * * Copyright (C) 2009-2015, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* * file name: unorm2.h * encoding: UTF-8 * tab size: 8 (not used) * indentation:4 * * created on: 2009dec15 * created by: Markus W. Scherer */ #ifndef __UNORM2_H__ #define __UNORM2_H__ /** * \file * \brief C API: New API for Unicode Normalization. * * Unicode normalization functionality for standard Unicode normalization or * for using custom mapping tables. * All instances of UNormalizer2 are unmodifiable/immutable. 
* Instances returned by unorm2_getInstance() are singletons that must not be deleted by the caller.
 * For more details see the Normalizer2 C++ class.
 */

/**
 * Constants for normalization modes.
 * For details about standard Unicode normalization forms
 * and about the algorithms which are also used with custom mapping tables
 * see http://www.unicode.org/unicode/reports/tr15/
 * @stable ICU 4.4
 */
typedef enum {
    /**
     * Decomposition followed by composition.
     * Same as standard NFC when using an "nfc" instance.
     * Same as standard NFKC when using an "nfkc" instance.
     * For details about standard Unicode normalization forms
     * see http://www.unicode.org/unicode/reports/tr15/
     * @stable ICU 4.4
     */
    UNORM2_COMPOSE,
    /**
     * Map, and reorder canonically.
     * Same as standard NFD when using an "nfc" instance.
     * Same as standard NFKD when using an "nfkc" instance.
     * For details about standard Unicode normalization forms
     * see http://www.unicode.org/unicode/reports/tr15/
     * @stable ICU 4.4
     */
    UNORM2_DECOMPOSE,
    /**
     * "Fast C or D" form.
     * If a string is in this form, then further decomposition without reordering
     * would yield the same form as DECOMPOSE.
     * Text in "Fast C or D" form can be processed efficiently with data tables
     * that are "canonically closed", that is, that provide equivalent data for
     * equivalent text, without having to be fully normalized.
     * Not a standard Unicode normalization form.
     * Not a unique form: Different FCD strings can be canonically equivalent.
     * For details see http://www.unicode.org/notes/tn5/#FCD
     * @stable ICU 4.4
     */
    UNORM2_FCD,
    /**
     * Compose only contiguously.
     * Also known as "FCC" or "Fast C Contiguous".
     * The result will often but not always be in NFC.
     * The result will conform to FCD which is useful for processing.
     * Not a standard Unicode normalization form.
     * For details see http://www.unicode.org/notes/tn5/#FCC
     * @stable ICU 4.4
     */
    UNORM2_COMPOSE_CONTIGUOUS
} UNormalization2Mode;

/**
 * Result values for normalization quick check functions.
* For details see http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms * @stable ICU 2.0 */ typedef enum UNormalizationCheckResult { /** * The input string is not in the normalization form. * @stable ICU 2.0 */ UNORM_NO, /** * The input string is in the normalization form. * @stable ICU 2.0 */ UNORM_YES, /** * The input string may or may not be in the normalization form. * This value is only returned for composition forms like NFC and FCC, * when a backward-combining character is found for which the surrounding text * would have to be analyzed further. * @stable ICU 2.0 */ UNORM_MAYBE } UNormalizationCheckResult; /** * Opaque C service object type for the new normalization API. * @stable ICU 4.4 */ struct UNormalizer2; typedef struct UNormalizer2 UNormalizer2; /**< C typedef for struct UNormalizer2. @stable ICU 4.4 */ #if !UCONFIG_NO_NORMALIZATION /** * Returns a UNormalizer2 instance for Unicode NFC normalization. * Same as unorm2_getInstance(NULL, "nfc", UNORM2_COMPOSE, pErrorCode). * Returns an unmodifiable singleton instance. Do not delete it. * @param pErrorCode Standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return the requested Normalizer2, if successful * @stable ICU 49 */ U_CAPI const UNormalizer2 * U_EXPORT2 unorm2_getNFCInstance(UErrorCode *pErrorCode); /** * Returns a UNormalizer2 instance for Unicode NFD normalization. * Same as unorm2_getInstance(NULL, "nfc", UNORM2_DECOMPOSE, pErrorCode). * Returns an unmodifiable singleton instance. Do not delete it. * @param pErrorCode Standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) 
* @return the requested Normalizer2, if successful
 * @stable ICU 49
 */
U_CAPI const UNormalizer2 * U_EXPORT2
unorm2_getNFDInstance(UErrorCode *pErrorCode);

/**
 * Returns a UNormalizer2 instance for Unicode NFKC normalization.
 * Same as unorm2_getInstance(NULL, "nfkc", UNORM2_COMPOSE, pErrorCode).
 * Returns an unmodifiable singleton instance. Do not delete it.
 * @param pErrorCode Standard ICU error code. Its input value must
 *                   pass the U_SUCCESS() test, or else the function returns
 *                   immediately. Check for U_FAILURE() on output or use with
 *                   function chaining. (See User Guide for details.)
 * @return the requested Normalizer2, if successful
 * @stable ICU 49
 */
U_CAPI const UNormalizer2 * U_EXPORT2
unorm2_getNFKCInstance(UErrorCode *pErrorCode);

/**
 * Returns a UNormalizer2 instance for Unicode NFKD normalization.
 * Same as unorm2_getInstance(NULL, "nfkc", UNORM2_DECOMPOSE, pErrorCode).
 * Returns an unmodifiable singleton instance. Do not delete it.
 * @param pErrorCode Standard ICU error code. Its input value must
 *                   pass the U_SUCCESS() test, or else the function returns
 *                   immediately. Check for U_FAILURE() on output or use with
 *                   function chaining. (See User Guide for details.)
 * @return the requested Normalizer2, if successful
 * @stable ICU 49
 */
U_CAPI const UNormalizer2 * U_EXPORT2
unorm2_getNFKDInstance(UErrorCode *pErrorCode);

/**
 * Returns a UNormalizer2 instance for Unicode NFKC_Casefold normalization.
 * Same as unorm2_getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, pErrorCode).
 * Returns an unmodifiable singleton instance. Do not delete it.
 * @param pErrorCode Standard ICU error code. Its input value must
 * pass the U_SUCCESS() test, or else the function returns
 * immediately. Check for U_FAILURE() on output or use with
 * function chaining. (See User Guide for details.)
* @return the requested Normalizer2, if successful * @stable ICU 49 */ U_CAPI const UNormalizer2 * U_EXPORT2 unorm2_getNFKCCasefoldInstance(UErrorCode *pErrorCode); /** * Returns a UNormalizer2 instance which uses the specified data file * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle) * and which composes or decomposes text according to the specified mode. * Returns an unmodifiable singleton instance. Do not delete it. * * Use packageName=NULL for data files that are part of ICU's own data. * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD. * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD. * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold. * * @param packageName NULL for ICU built-in data, otherwise application data package name * @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file * @param mode normalization mode (compose or decompose etc.) * @param pErrorCode Standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return the requested UNormalizer2, if successful * @stable ICU 4.4 */ U_CAPI const UNormalizer2 * U_EXPORT2 unorm2_getInstance(const char *packageName, const char *name, UNormalization2Mode mode, UErrorCode *pErrorCode); /** * Constructs a filtered normalizer wrapping any UNormalizer2 instance * and a filter set. * Both are aliased and must not be modified or deleted while this object * is used. * The filter set should be frozen; otherwise the performance will suffer greatly. * @param norm2 wrapped UNormalizer2 instance * @param filterSet USet which determines the characters to be normalized * @param pErrorCode Standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. 
Check for U_FAILURE() on output or use with
 *                   function chaining. (See User Guide for details.)
 * @return the requested UNormalizer2, if successful
 * @stable ICU 4.4
 */
U_CAPI UNormalizer2 * U_EXPORT2
unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode);

/**
 * Closes a UNormalizer2 instance from unorm2_openFiltered().
 * Do not close instances from unorm2_getInstance()!
 * @param norm2 UNormalizer2 instance to be closed
 * @stable ICU 4.4
 */
U_CAPI void U_EXPORT2
unorm2_close(UNormalizer2 *norm2);

/**
 * Writes the normalized form of the source string to the destination string
 * (replacing its contents) and returns the length of the destination string.
 * The source and destination strings must be different buffers.
 * @param norm2 UNormalizer2 instance
 * @param src source string
 * @param length length of the source string, or -1 if NUL-terminated
 * @param dest destination string; its contents are replaced with normalized src
 * @param capacity number of UChars that can be written to dest
 * @param pErrorCode Standard ICU error code. Its input value must
 *                   pass the U_SUCCESS() test, or else the function returns
 *                   immediately. Check for U_FAILURE() on output or use with
 *                   function chaining. (See User Guide for details.)
 * @return the length of the destination string
 * @stable ICU 4.4
 */
U_CAPI int32_t U_EXPORT2
unorm2_normalize(const UNormalizer2 *norm2,
                 const UChar *src, int32_t length,
                 UChar *dest, int32_t capacity,
                 UErrorCode *pErrorCode);

/**
 * Appends the normalized form of the second string to the first string
 * (merging them at the boundary) and returns the length of the first string.
 * The result is normalized if the first string was normalized.
 * The first and second strings must be different buffers.
* @param norm2 UNormalizer2 instance
 * @param first string, should be normalized
 * @param firstLength length of the first string, or -1 if NUL-terminated
 * @param firstCapacity number of UChars that can be written to first
 * @param second string, will be normalized
 * @param secondLength length of the second string, or -1 if NUL-terminated
 * @param pErrorCode Standard ICU error code. Its input value must
 *                   pass the U_SUCCESS() test, or else the function returns
 *                   immediately. Check for U_FAILURE() on output or use with
 *                   function chaining. (See User Guide for details.)
 * @return the length of the first string
 * @stable ICU 4.4
 */
U_CAPI int32_t U_EXPORT2
unorm2_normalizeSecondAndAppend(const UNormalizer2 *norm2,
                                UChar *first, int32_t firstLength, int32_t firstCapacity,
                                const UChar *second, int32_t secondLength,
                                UErrorCode *pErrorCode);

/**
 * Appends the second string to the first string
 * (merging them at the boundary) and returns the length of the first string.
 * The result is normalized if both the strings were normalized.
 * The first and second strings must be different buffers.
 * @param norm2 UNormalizer2 instance
 * @param first string, should be normalized
 * @param firstLength length of the first string, or -1 if NUL-terminated
 * @param firstCapacity number of UChars that can be written to first
 * @param second string, should be normalized
 * @param secondLength length of the second string, or -1 if NUL-terminated
 * @param pErrorCode Standard ICU error code. Its input value must
 *                   pass the U_SUCCESS() test, or else the function returns
 *                   immediately. Check for U_FAILURE() on output or use with
 *                   function chaining. (See User Guide for details.)
 * @return the length of the first string
 * @stable ICU 4.4
 */
U_CAPI int32_t U_EXPORT2
unorm2_append(const UNormalizer2 *norm2,
              UChar *first, int32_t firstLength, int32_t firstCapacity,
              const UChar *second, int32_t secondLength,
              UErrorCode *pErrorCode);

/**
 * Gets the decomposition mapping of c.
* Roughly equivalent to normalizing the String form of c * on a UNORM2_DECOMPOSE UNormalizer2 instance, but much faster, and except that this function * returns a negative value and does not write a string * if c does not have a decomposition mapping in this instance's data. * This function is independent of the mode of the UNormalizer2. * @param norm2 UNormalizer2 instance * @param c code point * @param decomposition String buffer which will be set to c's * decomposition mapping, if there is one. * @param capacity number of UChars that can be written to decomposition * @param pErrorCode Standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return the non-negative length of c's decomposition, if there is one; otherwise a negative value * @stable ICU 4.6 */ U_CAPI int32_t U_EXPORT2 unorm2_getDecomposition(const UNormalizer2 *norm2, UChar32 c, UChar *decomposition, int32_t capacity, UErrorCode *pErrorCode); /** * Gets the raw decomposition mapping of c. * * This is similar to the unorm2_getDecomposition() function but returns the * raw decomposition mapping as specified in UnicodeData.txt or * (for custom data) in the mapping files processed by the gennorm2 tool. * By contrast, unorm2_getDecomposition() returns the processed, * recursively-decomposed version of this mapping. * * When used on a standard NFKC Normalizer2 instance, * unorm2_getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property. * * When used on a standard NFC Normalizer2 instance, * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can); * in this case, the result contains either one or two code points (=1..4 UChars). * * This function is independent of the mode of the UNormalizer2. 
* @param norm2 UNormalizer2 instance * @param c code point * @param decomposition String buffer which will be set to c's * raw decomposition mapping, if there is one. * @param capacity number of UChars that can be written to decomposition * @param pErrorCode Standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return the non-negative length of c's raw decomposition, if there is one; otherwise a negative value * @stable ICU 49 */ U_CAPI int32_t U_EXPORT2 unorm2_getRawDecomposition(const UNormalizer2 *norm2, UChar32 c, UChar *decomposition, int32_t capacity, UErrorCode *pErrorCode); /** * Performs pairwise composition of a & b and returns the composite if there is one. * * Returns a composite code point c only if c has a two-way mapping to a+b. * In standard Unicode normalization, this means that * c has a canonical decomposition to a+b * and c does not have the Full_Composition_Exclusion property. * * This function is independent of the mode of the UNormalizer2. * @param norm2 UNormalizer2 instance * @param a A (normalization starter) code point. * @param b Another code point. * @return The non-negative composite code point if there is one; otherwise a negative value. * @stable ICU 49 */ U_CAPI UChar32 U_EXPORT2 unorm2_composePair(const UNormalizer2 *norm2, UChar32 a, UChar32 b); /** * Gets the combining class of c. * The default implementation returns 0 * but all standard implementations return the Unicode Canonical_Combining_Class value. * @param norm2 UNormalizer2 instance * @param c code point * @return c's combining class * @stable ICU 49 */ U_CAPI uint8_t U_EXPORT2 unorm2_getCombiningClass(const UNormalizer2 *norm2, UChar32 c); /** * Tests if the string is normalized. 
* Internally, in cases where unorm2_quickCheck() would return "maybe" (UNORM_MAYBE)
 * (which is only possible for the two COMPOSE modes) this function
 * resolves to "yes" or "no" to provide a definitive result,
 * at the cost of doing more work in those cases.
 * @param norm2 UNormalizer2 instance
 * @param s input string
 * @param length length of the string, or -1 if NUL-terminated
 * @param pErrorCode Standard ICU error code. Its input value must
 *                   pass the U_SUCCESS() test, or else the function returns
 *                   immediately. Check for U_FAILURE() on output or use with
 *                   function chaining. (See User Guide for details.)
 * @return true if s is normalized
 * @stable ICU 4.4
 */
U_CAPI UBool U_EXPORT2
unorm2_isNormalized(const UNormalizer2 *norm2,
                    const UChar *s, int32_t length,
                    UErrorCode *pErrorCode);

/**
 * Tests if the string is normalized.
 * For the two COMPOSE modes, the result could be "maybe" (UNORM_MAYBE) in cases that
 * would take a little more work to resolve definitively.
 * Use unorm2_spanQuickCheckYes() and unorm2_normalizeSecondAndAppend() for a faster
 * combination of quick check + normalization, to avoid
 * re-checking the "yes" prefix.
 * @param norm2 UNormalizer2 instance
 * @param s input string
 * @param length length of the string, or -1 if NUL-terminated
 * @param pErrorCode Standard ICU error code. Its input value must
 *                   pass the U_SUCCESS() test, or else the function returns
 *                   immediately. Check for U_FAILURE() on output or use with
 *                   function chaining. (See User Guide for details.)
 * @return UNormalizationCheckResult
 * @stable ICU 4.4
 */
U_CAPI UNormalizationCheckResult U_EXPORT2
unorm2_quickCheck(const UNormalizer2 *norm2,
                  const UChar *s, int32_t length,
                  UErrorCode *pErrorCode);

/**
 * Returns the end of the normalized substring of the input string.
 * In other words, with end=unorm2_spanQuickCheckYes(norm2, s, length, pErrorCode);
 * the substring of s from index 0 up to (but excluding) end
 * will pass the quick check with a "yes" result.
*
 * The returned end index is usually one or more characters before the
 * "no" or "maybe" character: The end index is at a normalization boundary.
 * (See the Normalizer2 class documentation for more about normalization boundaries.)
 *
 * When the goal is a normalized string and most input strings are expected
 * to be normalized already, then call this function,
 * and if it returns a prefix shorter than the input string,
 * copy that prefix and use unorm2_normalizeSecondAndAppend() for the remainder.
 * @param norm2 UNormalizer2 instance
 * @param s input string
 * @param length length of the string, or -1 if NUL-terminated
 * @param pErrorCode Standard ICU error code. Its input value must
 *                   pass the U_SUCCESS() test, or else the function returns
 *                   immediately. Check for U_FAILURE() on output or use with
 *                   function chaining. (See User Guide for details.)
 * @return "yes" span end index
 * @stable ICU 4.4
 */
U_CAPI int32_t U_EXPORT2
unorm2_spanQuickCheckYes(const UNormalizer2 *norm2,
                         const UChar *s, int32_t length,
                         UErrorCode *pErrorCode);

/**
 * Tests if the character always has a normalization boundary before it,
 * regardless of context.
 * For details see the Normalizer2 base class documentation.
 * @param norm2 UNormalizer2 instance
 * @param c character to test
 * @return true if c has a normalization boundary before it
 * @stable ICU 4.4
 */
U_CAPI UBool U_EXPORT2
unorm2_hasBoundaryBefore(const UNormalizer2 *norm2, UChar32 c);

/**
 * Tests if the character always has a normalization boundary after it,
 * regardless of context.
 * For details see the Normalizer2 base class documentation.
 * @param norm2 UNormalizer2 instance
 * @param c character to test
 * @return true if c has a normalization boundary after it
 * @stable ICU 4.4
 */
U_CAPI UBool U_EXPORT2
unorm2_hasBoundaryAfter(const UNormalizer2 *norm2, UChar32 c);

/**
 * Tests if the character is normalization-inert.
 * For details see the Normalizer2 base class documentation.
* @param norm2 UNormalizer2 instance * @param c character to test * @return true if c is normalization-inert * @stable ICU 4.4 */ U_CAPI UBool U_EXPORT2 unorm2_isInert(const UNormalizer2 *norm2, UChar32 c); /** * Compares two strings for canonical equivalence. * Further options include case-insensitive comparison and * code point order (as opposed to code unit order). * * Canonical equivalence between two strings is defined as their normalized * forms (NFD or NFC) being identical. * This function compares strings incrementally instead of normalizing * (and optionally case-folding) both strings entirely, * improving performance significantly. * * Bulk normalization is only necessary if the strings do not fulfill the FCD * conditions. Only in this case, and only if the strings are relatively long, * is memory allocated temporarily. * For FCD strings and short non-FCD strings there is no memory allocation. * * Semantically, this is equivalent to * strcmp[CodePointOrder](NFD(foldCase(NFD(s1))), NFD(foldCase(NFD(s2)))) * where code point order and foldCase are all optional. * * UAX 21 2.5 Caseless Matching specifies that for a canonical caseless match * the case folding must be performed first, then the normalization. * * @param s1 First source string. * @param length1 Length of first source string, or -1 if NUL-terminated. * * @param s2 Second source string. * @param length2 Length of second source string, or -1 if NUL-terminated. * * @param options A bit set of options: * - U_FOLD_CASE_DEFAULT or 0 is used for default options: * Case-sensitive comparison in code unit order, and the input strings * are quick-checked for FCD. * * - UNORM_INPUT_IS_FCD * Set if the caller knows that both s1 and s2 fulfill the FCD conditions. * If not set, the function will quickCheck for FCD * and normalize if necessary. * * - U_COMPARE_CODE_POINT_ORDER * Set to choose code point order instead of code unit order * (see u_strCompare for details). 
* * - U_COMPARE_IGNORE_CASE * Set to compare strings case-insensitively using case folding, * instead of case-sensitively. * If set, then the following case folding options are used. * * - Options as used with case-insensitive comparisons, currently: * * - U_FOLD_CASE_EXCLUDE_SPECIAL_I * (see u_strCaseCompare for details) * * - regular normalization options shifted left by UNORM_COMPARE_NORM_OPTIONS_SHIFT * * @param pErrorCode ICU error code in/out parameter. * Must fulfill U_SUCCESS before the function call. * @return <0 or 0 or >0 as usual for string comparisons * * @see unorm_normalize * @see UNORM_FCD * @see u_strCompare * @see u_strCaseCompare * * @stable ICU 2.2 */ U_CAPI int32_t U_EXPORT2 unorm_compare(const UChar *s1, int32_t length1, const UChar *s2, int32_t length2, uint32_t options, UErrorCode *pErrorCode); #endif /* !UCONFIG_NO_NORMALIZATION */ #endif /* __UNORM2_H__ */ // unorm.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (c) 1996-2016, International Business Machines Corporation * and others. All Rights Reserved. ******************************************************************************* * File unorm.h * * Created by: Vladimir Weinstein 12052000 * * Modification history : * * Date Name Description * 02/01/01 synwee Added normalization quickcheck enum and method. */ #ifndef UNORM_H #define UNORM_H #if !UCONFIG_NO_NORMALIZATION /** * \file * \brief C API: Unicode Normalization * * Old Unicode normalization API. * * This API has been replaced by the unorm2.h API and is only available * for backward compatibility. The functions here simply delegate to the * unorm2.h functions, for example unorm2_getInstance() and unorm2_normalize(). * There is one exception: The new API does not provide a replacement for unorm_compare(). * Its declaration has been moved to unorm2.h. 
* * unorm_normalize transforms Unicode text into an equivalent composed or * decomposed form, allowing for easier sorting and searching of text. * unorm_normalize supports the standard normalization forms described in * * Unicode Standard Annex #15: Unicode Normalization Forms. * * Characters with accents or other adornments can be encoded in * several different ways in Unicode. For example, take the character A-acute. * In Unicode, this can be encoded as a single character (the * "composed" form): * * \code * 00C1 LATIN CAPITAL LETTER A WITH ACUTE * \endcode * * or as two separate characters (the "decomposed" form): * * \code * 0041 LATIN CAPITAL LETTER A * 0301 COMBINING ACUTE ACCENT * \endcode * * To a user of your program, however, both of these sequences should be * treated as the same "user-level" character "A with acute accent". When you are searching or * comparing text, you must ensure that these two sequences are treated * equivalently. In addition, you must handle characters with more than one * accent. Sometimes the order of a character's combining accents is * significant, while in other cases accent sequences in different orders are * really equivalent. * * Similarly, the string "ffi" can be encoded as three separate letters: * * \code * 0066 LATIN SMALL LETTER F * 0066 LATIN SMALL LETTER F * 0069 LATIN SMALL LETTER I * \endcode * * or as the single character * * \code * FB03 LATIN SMALL LIGATURE FFI * \endcode * * The ffi ligature is not a distinct semantic character, and strictly speaking * it shouldn't be in Unicode at all, but it was included for compatibility * with existing character sets that already provided it. The Unicode standard * identifies such characters by giving them "compatibility" decompositions * into the corresponding semantic characters. When sorting and searching, you * will often want to use these mappings. 
*
 * unorm_normalize helps solve these problems by transforming text into the
 * canonical composed and decomposed forms as shown in the first example above.
 * In addition, you can have it perform compatibility decompositions so that
 * you can treat compatibility characters the same as their equivalents.
 * Finally, unorm_normalize rearranges accents into the proper canonical
 * order, so that you do not have to worry about accent rearrangement on your
 * own.
 *
 * Form FCD, "Fast C or D", is also designed for collation.
 * It makes it possible to work on strings that are not necessarily normalized
 * with an algorithm (like in collation) that works under "canonical closure", i.e., it treats precomposed
 * characters and their decomposed equivalents the same.
 *
 * It is not a normalization form because it does not provide for uniqueness of representation. Multiple strings
 * may be canonically equivalent (their NFDs are identical) and may all conform to FCD without being identical
 * themselves.
 *
 * The form is defined such that the "raw decomposition", the recursive canonical decomposition of each character,
 * results in a string that is canonically ordered. This means that precomposed characters are allowed as long
 * as their decompositions do not need canonical reordering.
 *
 * Its advantage for a process like collation is that all NFD and most NFC texts - and many unnormalized texts -
 * already conform to FCD and do not need to be normalized (NFD) for such a process. The FCD quick check will
 * return UNORM_YES for most strings in practice.
 *
 * unorm_normalize(UNORM_FCD) may be implemented with UNORM_NFD.
 *
 * For more details on FCD see the collation design document:
 * https://htmlpreview.github.io/?https://github.com/unicode-org/icu-docs/blob/main/design/collation/ICU_collation_design.htm
 *
 * ICU collation performs either NFD or FCD normalization automatically if normalization
 * is turned on for the collator object.
* Beyond collation and string search, normalized strings may be useful for string equivalence comparisons, * transliteration/transcription, unique representations, etc. * * The W3C generally recommends to exchange texts in NFC. * Note also that most legacy character encodings use only precomposed forms and often do not * encode any combining marks by themselves. For conversion to such character encodings the * Unicode text needs to be normalized to NFC. * For more usage examples, see the Unicode Standard Annex. */ // Do not conditionalize the following enum with #ifndef U_HIDE_DEPRECATED_API, // it is needed for layout of Normalizer object. #ifndef U_FORCE_HIDE_DEPRECATED_API /** * Constants for normalization modes. * @deprecated ICU 56 Use unorm2.h instead. */ typedef enum { /** No decomposition/composition. @deprecated ICU 56 Use unorm2.h instead. */ UNORM_NONE = 1, /** Canonical decomposition. @deprecated ICU 56 Use unorm2.h instead. */ UNORM_NFD = 2, /** Compatibility decomposition. @deprecated ICU 56 Use unorm2.h instead. */ UNORM_NFKD = 3, /** Canonical decomposition followed by canonical composition. @deprecated ICU 56 Use unorm2.h instead. */ UNORM_NFC = 4, /** Default normalization. @deprecated ICU 56 Use unorm2.h instead. */ UNORM_DEFAULT = UNORM_NFC, /** Compatibility decomposition followed by canonical composition. @deprecated ICU 56 Use unorm2.h instead. */ UNORM_NFKC =5, /** "Fast C or D" form. @deprecated ICU 56 Use unorm2.h instead. */ UNORM_FCD = 6, /** One more than the highest normalization mode constant. @deprecated ICU 56 Use unorm2.h instead. */ UNORM_MODE_COUNT } UNormalizationMode; #endif // U_FORCE_HIDE_DEPRECATED_API #endif /* #if !UCONFIG_NO_NORMALIZATION */ #endif // ucnvsel.h // Copyright (C) 2016 and later: Unicode, Inc. and others. 
// License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * * Copyright (C) 2008-2011, International Business Machines * Corporation, Google and others. All Rights Reserved. * ******************************************************************************* */ /* * Author : eldawy@google.com (Mohamed Eldawy) * ucnvsel.h * * Purpose: To generate a list of encodings capable of handling * a given Unicode text * * Started 09-April-2008 */ #ifndef __ICU_UCNV_SEL_H__ #define __ICU_UCNV_SEL_H__ #if !UCONFIG_NO_CONVERSION /** * \file * \brief C API: Encoding/charset encoding selector * * A converter selector is built with a set of encoding/charset names * and given an input string returns the set of names of the * corresponding converters which can convert the string. * * A converter selector can be serialized into a buffer and reopened * from the serialized form. */ /** * @{ * Typedef for selector data structure. */ struct UConverterSelector; typedef struct UConverterSelector UConverterSelector; /** @} */ /** * Open a selector. * If converterListSize is 0, build for all available converters. * If excludedCodePoints is NULL, don't exclude any code points. * * @param converterList a pointer to encoding names needed to be involved. * Can be NULL if converterListSize==0. * The list and the names will be cloned, and the caller * retains ownership of the original. * @param converterListSize number of encodings in above list. * If 0, builds a selector for all available converters. * @param excludedCodePoints a set of code points to be excluded from consideration. * That is, excluded code points in a string do not change * the selection result. (They might be handled by a callback.) * Use NULL to exclude nothing. * @param whichSet what converter set to use? Use this to determine whether * to consider only roundtrip mappings or also fallbacks. 
* @param status an in/out ICU UErrorCode
 * @return the new selector
 *
 * @stable ICU 4.2
 */
U_CAPI UConverterSelector* U_EXPORT2
ucnvsel_open(const char* const* converterList, int32_t converterListSize,
             const USet* excludedCodePoints,
             const UConverterUnicodeSet whichSet, UErrorCode* status);

/**
 * Closes a selector.
 * If any Enumerations were returned by ucnvsel_select*, they become invalid.
 * They can be closed before or after calling ucnvsel_close,
 * but should never be used after the selector is closed.
 *
 * @see ucnvsel_selectForString
 * @see ucnvsel_selectForUTF8
 *
 * @param sel selector to close
 *
 * @stable ICU 4.2
 */
U_CAPI void U_EXPORT2
ucnvsel_close(UConverterSelector *sel);

/**
 * Open a selector from its serialized form.
 * The buffer must remain valid and unchanged for the lifetime of the selector.
 * This is much faster than creating a selector from scratch.
 * Using a serialized form from a different machine (endianness/charset) is supported.
 *
 * @param buffer pointer to the serialized form of a converter selector;
 *               must be 32-bit-aligned
 * @param length the capacity of this buffer (can be equal to or larger than
 *               the actual data length)
 * @param status an in/out ICU UErrorCode
 * @return the new selector
 *
 * @stable ICU 4.2
 */
U_CAPI UConverterSelector* U_EXPORT2
ucnvsel_openFromSerialized(const void* buffer, int32_t length, UErrorCode* status);

/**
 * Serialize a selector into a linear buffer.
 * The serialized form is portable to different machines.
*
 * @param sel selector to consider
 * @param buffer pointer to 32-bit-aligned memory to be filled with the
 *               serialized form of this converter selector
 * @param bufferCapacity the capacity of this buffer
 * @param status an in/out ICU UErrorCode
 * @return the required buffer capacity to hold the serialized data (even if the call fails
 *         with a U_BUFFER_OVERFLOW_ERROR, it will return the required capacity)
 *
 * @stable ICU 4.2
 */
U_CAPI int32_t U_EXPORT2
ucnvsel_serialize(const UConverterSelector* sel,
                  void* buffer, int32_t bufferCapacity, UErrorCode* status);

/**
 * Select converters that can map all characters in a UTF-16 string,
 * ignoring the excluded code points.
 *
 * @param sel a selector
 * @param s UTF-16 string
 * @param length length of the string, or -1 if NUL-terminated
 * @param status an in/out ICU UErrorCode
 * @return an enumeration containing encoding names.
 *         The returned encoding names and their order will be the same as
 *         supplied when building the selector.
 *
 * @stable ICU 4.2
 */
U_CAPI UEnumeration * U_EXPORT2
ucnvsel_selectForString(const UConverterSelector* sel,
                        const UChar *s, int32_t length, UErrorCode *status);

/**
 * Select converters that can map all characters in a UTF-8 string,
 * ignoring the excluded code points.
 *
 * @param sel a selector
 * @param s UTF-8 string
 * @param length length of the string, or -1 if NUL-terminated
 * @param status an in/out ICU UErrorCode
 * @return an enumeration containing encoding names.
 *         The returned encoding names and their order will be the same as
 *         supplied when building the selector.
 *
 * @stable ICU 4.2
 */
U_CAPI UEnumeration * U_EXPORT2
ucnvsel_selectForUTF8(const UConverterSelector* sel,
                      const char *s, int32_t length, UErrorCode *status);

#endif  /* !UCONFIG_NO_CONVERSION */

#endif  /* __ICU_UCNV_SEL_H__ */

// putil.h
// Copyright (C) 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html /* ****************************************************************************** * * Copyright (C) 1997-2014, International Business Machines * Corporation and others. All Rights Reserved. * ****************************************************************************** * * FILE NAME : putil.h * * Date Name Description * 05/14/98 nos Creation (content moved here from utypes.h). * 06/17/99 erm Added IEEE_754 * 07/22/98 stephen Added IEEEremainder, max, min, trunc * 08/13/98 stephen Added isNegativeInfinity, isPositiveInfinity * 08/24/98 stephen Added longBitsFromDouble * 03/02/99 stephen Removed openFile(). Added AS400 support. * 04/15/99 stephen Converted to C * 11/15/99 helena Integrated S/390 changes for IEEE support. * 01/11/00 helena Added u_getVersion. ****************************************************************************** */ #ifndef PUTIL_H #define PUTIL_H /** * \file * \brief C API: Platform Utilities */ /*==========================================================================*/ /* Platform utilities */ /*==========================================================================*/ /** * Platform utilities isolates the platform dependencies of the * library. For each platform which this code is ported to, these * functions may have to be re-implemented. */ /** @} */ /** * Convert char characters to UChar characters. * This utility function is useful only for "invariant characters" * that are encoded in the platform default encoding. * They are a small, constant subset of the encoding and include * just the latin letters, digits, and some punctuation. * For details, see U_CHARSET_FAMILY. * * @param cs Input string, points to length * character bytes from a subset of the platform encoding. * @param us Output string, points to memory for length * Unicode characters. * @param length The number of characters to convert; this may * include the terminating NUL. 
* * @see U_CHARSET_FAMILY * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 u_charsToUChars(const char *cs, UChar *us, int32_t length); /** * Convert UChar characters to char characters. * This utility function is useful only for "invariant characters" * that can be encoded in the platform default encoding. * They are a small, constant subset of the encoding and include * just the latin letters, digits, and some punctuation. * For details, see U_CHARSET_FAMILY. * * @param us Input string, points to length * Unicode characters that can be encoded with the * codepage-invariant subset of the platform encoding. * @param cs Output string, points to memory for length * character bytes. * @param length The number of characters to convert; this may * include the terminating NUL. * * @see U_CHARSET_FAMILY * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 u_UCharsToChars(const UChar *us, char *cs, int32_t length); #endif // ustring.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (C) 1998-2014, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * * File ustring.h * * Modification History: * * Date Name Description * 12/07/98 bertrand Creation. ****************************************************************************** */ #ifndef USTRING_H #define USTRING_H /** * \def UBRK_TYPEDEF_UBREAK_ITERATOR * @internal */ #ifndef UBRK_TYPEDEF_UBREAK_ITERATOR # define UBRK_TYPEDEF_UBREAK_ITERATOR /** Simple declaration for u_strToTitle() to avoid including unicode/ubrk.h. @stable ICU 2.1*/ typedef struct UBreakIterator UBreakIterator; #endif /** * \file * \brief C API: Unicode string handling functions * * These C API functions provide general Unicode string handling. 
* * Some functions are equivalent in name, signature, and behavior to the ANSI C * functions. (For example, they do not check for bad arguments like NULL string pointers.) * In some cases, only the thread-safe variant of such a function is implemented here * (see u_strtok_r()). * * Other functions provide more Unicode-specific functionality like locale-specific * upper/lower-casing and string comparison in code point order. * * ICU uses 16-bit Unicode (UTF-16) in the form of arrays of UChar code units. * UTF-16 encodes each Unicode code point with either one or two UChar code units. * (This is the default form of Unicode, and a forward-compatible extension of the original, * fixed-width form that was known as UCS-2. UTF-16 superseded UCS-2 with Unicode 2.0 * in 1996.) * * Some APIs accept a 32-bit UChar32 value for a single code point. * * ICU also handles 16-bit Unicode text with unpaired surrogates. * Such text is not well-formed UTF-16. * Code-point-related functions treat unpaired surrogates as surrogate code points, * i.e., as separate units. * * Although UTF-16 is a variable-width encoding form (like some legacy multi-byte encodings), * it is much more efficient even for random access because the code unit values * for single-unit characters vs. lead units vs. trail units are completely disjoint. * This means that it is easy to determine character (code point) boundaries from * random offsets in the string. * * Unicode (UTF-16) string processing is optimized for the single-unit case. * Although it is important to support supplementary characters * (which use pairs of lead/trail code units called "surrogates"), * their occurrence is rare. Almost all characters in modern use require only * a single UChar code unit (i.e., their code point values are <=0xffff). * * For more details see the User Guide Strings chapter (https://unicode-org.github.io/icu/userguide/strings/). 
* For a discussion of the handling of unpaired surrogates see also * Jitterbug 2145 and its icu mailing list proposal on 2002-sep-18. */ /** * \defgroup ustring_ustrlen String Length * \ingroup ustring_strlen */ /*@{*/ /** * Determine the length of an array of UChar. * * @param s The array of UChars, NUL (U+0000) terminated. * @return The number of UChars in s, not counting the terminator. * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 u_strlen(const UChar *s); /*@}*/ /** * Count Unicode code points in the length UChar code units of the string. * A code point may occupy either one or two UChar code units. * Counting code points involves reading all code units. * * This function is basically the inverse of the U16_FWD_N() macro (see utf.h). * * @param s The input string. * @param length The number of UChar code units to be checked, or -1 to count all * code points before the first NUL (U+0000). * @return The number of code points in the specified code units. * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 u_countChar32(const UChar *s, int32_t length); /** * Check if the string contains more Unicode code points than a certain number. * This is more efficient than counting all code points in the entire string * and comparing that number with a threshold. * This function may not need to scan the string at all if the length is known * (not -1 for NUL-termination) and falls within a certain range, and * never needs to count more than 'number+1' code points. * Logically equivalent to (u_countChar32(s, length)>number). * A Unicode code point may occupy either one or two UChar code units. * * @param s The input string. * @param length The length of the string, or -1 if it is NUL-terminated. * @param number The number of code points in the string is compared against * the 'number' parameter. * @return Boolean value for whether the string contains more Unicode code points * than 'number'. Same as (u_countChar32(s, length)>number).
* @stable ICU 2.4 */ U_CAPI UBool U_EXPORT2 u_strHasMoreChar32Than(const UChar *s, int32_t length, int32_t number); /** * Concatenate two ustrings. Appends a copy of src, * including the null terminator, to dst. The initial copied * character from src overwrites the null terminator in dst. * * @param dst The destination string. * @param src The source string. * @return A pointer to dst. * @stable ICU 2.0 */ U_CAPI UChar* U_EXPORT2 u_strcat(UChar *dst, const UChar *src); /** * Concatenate two ustrings. * Appends at most n characters from src to dst. * Adds a terminating NUL. * If src is too long, then only n-1 characters will be copied * before the terminating NUL. * If n<=0 then dst is not modified. * * @param dst The destination string. * @param src The source string (can be NULL/invalid if n<=0). * @param n The maximum number of characters to append; no-op if <=0. * @return A pointer to dst. * @stable ICU 2.0 */ U_CAPI UChar* U_EXPORT2 u_strncat(UChar *dst, const UChar *src, int32_t n); /** * Find the first occurrence of a substring in a string. * The substring is found at code point boundaries. * That means that if the substring begins with * a trail surrogate or ends with a lead surrogate, * then it is found only if these surrogates stand alone in the text. * Otherwise, the substring edge units would be matched against * halves of surrogate pairs. * * @param s The string to search (NUL-terminated). * @param substring The substring to find (NUL-terminated). * @return A pointer to the first occurrence of substring in s, * or s itself if the substring is empty, * or NULL if substring is not in s. * @stable ICU 2.0 * * @see u_strrstr * @see u_strFindFirst * @see u_strFindLast */ U_CAPI UChar * U_EXPORT2 u_strstr(const UChar *s, const UChar *substring); /** * Find the first occurrence of a substring in a string. * The substring is found at code point boundaries. 
* That means that if the substring begins with * a trail surrogate or ends with a lead surrogate, * then it is found only if these surrogates stand alone in the text. * Otherwise, the substring edge units would be matched against * halves of surrogate pairs. * * @param s The string to search. * @param length The length of s (number of UChars), or -1 if it is NUL-terminated. * @param substring The substring to find (NUL-terminated). * @param subLength The length of substring (number of UChars), or -1 if it is NUL-terminated. * @return A pointer to the first occurrence of substring in s, * or s itself if the substring is empty, * or NULL if substring is not in s. * @stable ICU 2.4 * * @see u_strstr * @see u_strFindLast */ U_CAPI UChar * U_EXPORT2 u_strFindFirst(const UChar *s, int32_t length, const UChar *substring, int32_t subLength); /** * Find the first occurrence of a BMP code point in a string. * A surrogate code point is found only if its match in the text is not * part of a surrogate pair. * A NUL character is found at the string terminator. * * @param s The string to search (NUL-terminated). * @param c The BMP code point to find. * @return A pointer to the first occurrence of c in s * or NULL if c is not in s. * @stable ICU 2.0 * * @see u_strchr32 * @see u_memchr * @see u_strstr * @see u_strFindFirst */ U_CAPI UChar * U_EXPORT2 u_strchr(const UChar *s, UChar c); /** * Find the first occurrence of a code point in a string. * A surrogate code point is found only if its match in the text is not * part of a surrogate pair. * A NUL character is found at the string terminator. * * @param s The string to search (NUL-terminated). * @param c The code point to find. * @return A pointer to the first occurrence of c in s * or NULL if c is not in s. * @stable ICU 2.0 * * @see u_strchr * @see u_memchr32 * @see u_strstr * @see u_strFindFirst */ U_CAPI UChar * U_EXPORT2 u_strchr32(const UChar *s, UChar32 c); /** * Find the last occurrence of a substring in a string. 
* The substring is found at code point boundaries. * That means that if the substring begins with * a trail surrogate or ends with a lead surrogate, * then it is found only if these surrogates stand alone in the text. * Otherwise, the substring edge units would be matched against * halves of surrogate pairs. * * @param s The string to search (NUL-terminated). * @param substring The substring to find (NUL-terminated). * @return A pointer to the last occurrence of substring in s, * or s itself if the substring is empty, * or NULL if substring is not in s. * @stable ICU 2.4 * * @see u_strstr * @see u_strFindFirst * @see u_strFindLast */ U_CAPI UChar * U_EXPORT2 u_strrstr(const UChar *s, const UChar *substring); /** * Find the last occurrence of a substring in a string. * The substring is found at code point boundaries. * That means that if the substring begins with * a trail surrogate or ends with a lead surrogate, * then it is found only if these surrogates stand alone in the text. * Otherwise, the substring edge units would be matched against * halves of surrogate pairs. * * @param s The string to search. * @param length The length of s (number of UChars), or -1 if it is NUL-terminated. * @param substring The substring to find (NUL-terminated). * @param subLength The length of substring (number of UChars), or -1 if it is NUL-terminated. * @return A pointer to the last occurrence of substring in s, * or s itself if the substring is empty, * or NULL if substring is not in s. * @stable ICU 2.4 * * @see u_strstr * @see u_strFindLast */ U_CAPI UChar * U_EXPORT2 u_strFindLast(const UChar *s, int32_t length, const UChar *substring, int32_t subLength); /** * Find the last occurrence of a BMP code point in a string. * A surrogate code point is found only if its match in the text is not * part of a surrogate pair. * A NUL character is found at the string terminator. * * @param s The string to search (NUL-terminated). * @param c The BMP code point to find. 
* @return A pointer to the last occurrence of c in s * or NULL if c is not in s. * @stable ICU 2.4 * * @see u_strrchr32 * @see u_memrchr * @see u_strrstr * @see u_strFindLast */ U_CAPI UChar * U_EXPORT2 u_strrchr(const UChar *s, UChar c); /** * Find the last occurrence of a code point in a string. * A surrogate code point is found only if its match in the text is not * part of a surrogate pair. * A NUL character is found at the string terminator. * * @param s The string to search (NUL-terminated). * @param c The code point to find. * @return A pointer to the last occurrence of c in s * or NULL if c is not in s. * @stable ICU 2.4 * * @see u_strrchr * @see u_memrchr32 * @see u_strrstr * @see u_strFindLast */ U_CAPI UChar * U_EXPORT2 u_strrchr32(const UChar *s, UChar32 c); /** * Locates the first occurrence in the string string of any of the characters * in the string matchSet. * Works just like C's strpbrk but with Unicode. * * @param string The string in which to search, NUL-terminated. * @param matchSet A NUL-terminated string defining a set of code points * for which to search in the text string. * @return A pointer to the character in string that matches one of the * characters in matchSet, or NULL if no such character is found. * @stable ICU 2.0 */ U_CAPI UChar * U_EXPORT2 u_strpbrk(const UChar *string, const UChar *matchSet); /** * Returns the number of consecutive characters in string, * beginning with the first, that do not occur somewhere in matchSet. * Works just like C's strcspn but with Unicode. * * @param string The string in which to search, NUL-terminated. * @param matchSet A NUL-terminated string defining a set of code points * for which to search in the text string. * @return The number of initial characters in string that do not * occur in matchSet.
* @see u_strspn * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 u_strcspn(const UChar *string, const UChar *matchSet); /** * Returns the number of consecutive characters in string, * beginning with the first, that occur somewhere in matchSet. * Works just like C's strspn but with Unicode. * * @param string The string in which to search, NUL-terminated. * @param matchSet A NUL-terminated string defining a set of code points * for which to search in the text string. * @return The number of initial characters in string that do * occur in matchSet. * @see u_strcspn * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 u_strspn(const UChar *string, const UChar *matchSet); /** * The string tokenizer API allows an application to break a string into * tokens. Unlike strtok(), the saveState (the current pointer within the * original string) is maintained in saveState. In the first call, the * argument src is a pointer to the string. In subsequent calls to * return successive tokens of that string, src must be specified as * NULL. The value saveState is set by this function to maintain the * function's position within the string, and on each subsequent call * you must give this argument the same variable. This function does * handle surrogate pairs. This function is similar to strtok_r(), * the POSIX Threads Extension (1003.1c-1995) version. * * @param src String containing token(s). This string will be modified. * After the first call to u_strtok_r(), this argument must * be NULL to get to the next token. * @param delim Set of delimiter characters (Unicode code points). * @param saveState The current pointer within the original string, * which is set by this function. The saveState * parameter should be the address of a local variable of type * UChar *. (i.e. define "UChar *myLocalSaveState" and use * &myLocalSaveState for this parameter). * @return A pointer to the next token found in src, or NULL * when there are no more tokens.
* @stable ICU 2.0 */ U_CAPI UChar * U_EXPORT2 u_strtok_r(UChar *src, const UChar *delim, UChar **saveState); /** * Compare two Unicode strings for bitwise equality (code unit order). * * @param s1 A string to compare. * @param s2 A string to compare. * @return 0 if s1 and s2 are bitwise equal; a negative * value if s1 is bitwise less than s2; a positive * value if s1 is bitwise greater than s2. * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 u_strcmp(const UChar *s1, const UChar *s2); /** * Compare two Unicode strings in code point order. * See u_strCompare for details. * * @param s1 A string to compare. * @param s2 A string to compare. * @return a negative/zero/positive integer corresponding to whether * the first string is less than/equal to/greater than the second one * in code point order * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 u_strcmpCodePointOrder(const UChar *s1, const UChar *s2); /** * Compare two Unicode strings (binary order). * * The comparison can be done in code unit order or in code point order. * They differ only in UTF-16 when * comparing supplementary code points (U+10000..U+10ffff) * to BMP code points near the end of the BMP (i.e., U+e000..U+ffff). * In code unit order, high BMP code points sort after supplementary code points * because they are stored as pairs of surrogates which are at U+d800..U+dfff. * * This function works with strings of different explicitly specified lengths * unlike the ANSI C-like u_strcmp() and u_memcmp() etc. * NUL-terminated strings are possible with length arguments of -1. * * @param s1 First source string. * @param length1 Length of first source string, or -1 if NUL-terminated. * * @param s2 Second source string. * @param length2 Length of second source string, or -1 if NUL-terminated. * * @param codePointOrder Choose between code unit order (false) * and code point order (true).
* * @return <0 or 0 or >0 as usual for string comparisons * * @stable ICU 2.2 */ U_CAPI int32_t U_EXPORT2 u_strCompare(const UChar *s1, int32_t length1, const UChar *s2, int32_t length2, UBool codePointOrder); /** * Compare two Unicode strings (binary order) * as presented by UCharIterator objects. * Works otherwise just like u_strCompare(). * * Both iterators are reset to their start positions. * When the function returns, it is undefined where the iterators * have stopped. * * @param iter1 First source string iterator. * @param iter2 Second source string iterator. * @param codePointOrder Choose between code unit order (false) * and code point order (true). * * @return <0 or 0 or >0 as usual for string comparisons * * @see u_strCompare * * @stable ICU 2.6 */ U_CAPI int32_t U_EXPORT2 u_strCompareIter(UCharIterator *iter1, UCharIterator *iter2, UBool codePointOrder); /** * Compare two strings case-insensitively using full case folding. * This is equivalent to * u_strCompare(u_strFoldCase(s1, options), * u_strFoldCase(s2, options), * (options&U_COMPARE_CODE_POINT_ORDER)!=0). * * The comparison can be done in UTF-16 code unit order or in code point order. * They differ only when comparing supplementary code points (U+10000..U+10ffff) * to BMP code points near the end of the BMP (i.e., U+e000..U+ffff). * In code unit order, high BMP code points sort after supplementary code points * because they are stored as pairs of surrogates which are at U+d800..U+dfff. * * This function works with strings of different explicitly specified lengths * unlike the ANSI C-like u_strcmp() and u_memcmp() etc. * NUL-terminated strings are possible with length arguments of -1. * * @param s1 First source string. * @param length1 Length of first source string, or -1 if NUL-terminated. * * @param s2 Second source string. * @param length2 Length of second source string, or -1 if NUL-terminated.
* * @param options A bit set of options: * - U_FOLD_CASE_DEFAULT or 0 is used for default options: * Comparison in code unit order with default case folding. * * - U_COMPARE_CODE_POINT_ORDER * Set to choose code point order instead of code unit order * (see u_strCompare for details). * * - U_FOLD_CASE_EXCLUDE_SPECIAL_I * * @param pErrorCode Must be a valid pointer to an error code value, * which must not indicate a failure before the function call. * * @return <0 or 0 or >0 as usual for string comparisons * * @stable ICU 2.2 */ U_CAPI int32_t U_EXPORT2 u_strCaseCompare(const UChar *s1, int32_t length1, const UChar *s2, int32_t length2, uint32_t options, UErrorCode *pErrorCode); /** * Compare two ustrings for bitwise equality. * Compares at most n characters. * * @param ucs1 A string to compare (can be NULL/invalid if n<=0). * @param ucs2 A string to compare (can be NULL/invalid if n<=0). * @param n The maximum number of characters to compare; always returns 0 if n<=0. * @return 0 if ucs1 and ucs2 are bitwise equal; a negative * value if ucs1 is bitwise less than ucs2; a positive * value if ucs1 is bitwise greater than ucs2. * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 u_strncmp(const UChar *ucs1, const UChar *ucs2, int32_t n); /** * Compare two Unicode strings in code point order. * This is different in UTF-16 from u_strncmp() if supplementary characters are present. * For details, see u_strCompare(). * * @param s1 A string to compare. * @param s2 A string to compare. * @param n The maximum number of characters to compare. * @return a negative/zero/positive integer corresponding to whether * the first string is less than/equal to/greater than the second one * in code point order * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 u_strncmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t n); /** * Compare two strings case-insensitively using full case folding. * This is equivalent to u_strcmp(u_strFoldCase(s1, options), u_strFoldCase(s2, options)).
* * @param s1 A string to compare. * @param s2 A string to compare. * @param options A bit set of options: * - U_FOLD_CASE_DEFAULT or 0 is used for default options: * Comparison in code unit order with default case folding. * * - U_COMPARE_CODE_POINT_ORDER * Set to choose code point order instead of code unit order * (see u_strCompare for details). * * - U_FOLD_CASE_EXCLUDE_SPECIAL_I * * @return A negative, zero, or positive integer indicating the comparison result. * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 u_strcasecmp(const UChar *s1, const UChar *s2, uint32_t options); /** * Compare two strings case-insensitively using full case folding. * This is equivalent to u_strcmp(u_strFoldCase(s1, at most n, options), * u_strFoldCase(s2, at most n, options)). * * @param s1 A string to compare. * @param s2 A string to compare. * @param n The maximum number of characters in each string to case-fold and then compare. * @param options A bit set of options: * - U_FOLD_CASE_DEFAULT or 0 is used for default options: * Comparison in code unit order with default case folding. * * - U_COMPARE_CODE_POINT_ORDER * Set to choose code point order instead of code unit order * (see u_strCompare for details). * * - U_FOLD_CASE_EXCLUDE_SPECIAL_I * * @return A negative, zero, or positive integer indicating the comparison result. * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 u_strncasecmp(const UChar *s1, const UChar *s2, int32_t n, uint32_t options); /** * Compare two strings case-insensitively using full case folding. * This is equivalent to u_strcmp(u_strFoldCase(s1, length, options), * u_strFoldCase(s2, length, options)). * * @param s1 A string to compare. * @param s2 A string to compare. * @param length The number of characters in each string to case-fold and then compare. * @param options A bit set of options: * - U_FOLD_CASE_DEFAULT or 0 is used for default options: * Comparison in code unit order with default case folding.
* * - U_COMPARE_CODE_POINT_ORDER * Set to choose code point order instead of code unit order * (see u_strCompare for details). * * - U_FOLD_CASE_EXCLUDE_SPECIAL_I * * @return A negative, zero, or positive integer indicating the comparison result. * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 u_memcasecmp(const UChar *s1, const UChar *s2, int32_t length, uint32_t options); /** * Copy a ustring. Adds a null terminator. * * @param dst The destination string. * @param src The source string. * @return A pointer to dst. * @stable ICU 2.0 */ U_CAPI UChar* U_EXPORT2 u_strcpy(UChar *dst, const UChar *src); /** * Copy a ustring. * Copies at most n characters. The result will be null terminated * if the length of src is less than n. * * @param dst The destination string. * @param src The source string (can be NULL/invalid if n<=0). * @param n The maximum number of characters to copy; no-op if <=0. * @return A pointer to dst. * @stable ICU 2.0 */ U_CAPI UChar* U_EXPORT2 u_strncpy(UChar *dst, const UChar *src, int32_t n); #if !UCONFIG_NO_CONVERSION /** * Copy a byte string encoded in the default codepage to a ustring. * Adds a null terminator. * Performs a host byte to UChar conversion * * @param dst The destination string. * @param src The source string. * @return A pointer to dst. * @stable ICU 2.0 */ U_CAPI UChar* U_EXPORT2 u_uastrcpy(UChar *dst, const char *src ); /** * Copy a byte string encoded in the default codepage to a ustring. * Copies at most n characters. The result will be null terminated * if the length of src is less than n. * Performs a host byte to UChar conversion * * @param dst The destination string. * @param src The source string. * @param n The maximum number of characters to copy. * @return A pointer to dst. * @stable ICU 2.0 */ U_CAPI UChar* U_EXPORT2 u_uastrncpy(UChar *dst, const char *src, int32_t n); /** * Copy ustring to a byte string encoded in the default codepage. * Adds a null terminator. 
* Performs a UChar to host byte conversion * * @param dst The destination string. * @param src The source string. * @return A pointer to dst. * @stable ICU 2.0 */ U_CAPI char* U_EXPORT2 u_austrcpy(char *dst, const UChar *src ); /** * Copy ustring to a byte string encoded in the default codepage. * Copies at most n characters. The result will be null terminated * if the length of src is less than n. * Performs a UChar to host byte conversion * * @param dst The destination string. * @param src The source string. * @param n The maximum number of characters to copy. * @return A pointer to dst. * @stable ICU 2.0 */ U_CAPI char* U_EXPORT2 u_austrncpy(char *dst, const UChar *src, int32_t n ); #endif /** * Synonym for memcpy(), but with UChars only. * @param dest The destination string * @param src The source string (can be NULL/invalid if count<=0) * @param count The number of characters to copy; no-op if <=0 * @return A pointer to dest * @stable ICU 2.0 */ U_CAPI UChar* U_EXPORT2 u_memcpy(UChar *dest, const UChar *src, int32_t count); /** * Synonym for memmove(), but with UChars only. * @param dest The destination string * @param src The source string (can be NULL/invalid if count<=0) * @param count The number of characters to move; no-op if <=0 * @return A pointer to dest * @stable ICU 2.0 */ U_CAPI UChar* U_EXPORT2 u_memmove(UChar *dest, const UChar *src, int32_t count); /** * Initialize count characters of dest to c. * * @param dest The destination string. * @param c The character to initialize the string. * @param count The maximum number of characters to set. * @return A pointer to dest. * @stable ICU 2.0 */ U_CAPI UChar* U_EXPORT2 u_memset(UChar *dest, UChar c, int32_t count); /** * Compare the first count UChars of each buffer. * * @param buf1 The first string to compare. * @param buf2 The second string to compare. * @param count The maximum number of UChars to compare. * @return When buf1 < buf2, a negative number is returned. * When buf1 == buf2, 0 is returned. 
* When buf1 > buf2, a positive number is returned. * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 u_memcmp(const UChar *buf1, const UChar *buf2, int32_t count); /** * Compare two Unicode strings in code point order. * This is different in UTF-16 from u_memcmp() if supplementary characters are present. * For details, see u_strCompare(). * * @param s1 A string to compare. * @param s2 A string to compare. * @param count The maximum number of characters to compare. * @return a negative/zero/positive integer corresponding to whether * the first string is less than/equal to/greater than the second one * in code point order * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 u_memcmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t count); /** * Find the first occurrence of a BMP code point in a string. * A surrogate code point is found only if its match in the text is not * part of a surrogate pair. * A NUL character is found at the string terminator. * * @param s The string to search (contains count UChars). * @param c The BMP code point to find. * @param count The length of the string. * @return A pointer to the first occurrence of c in s * or NULL if c is not in s. * @stable ICU 2.0 * * @see u_strchr * @see u_memchr32 * @see u_strFindFirst */ U_CAPI UChar* U_EXPORT2 u_memchr(const UChar *s, UChar c, int32_t count); /** * Find the first occurrence of a code point in a string. * A surrogate code point is found only if its match in the text is not * part of a surrogate pair. * A NUL character is found at the string terminator. * * @param s The string to search (contains count UChars). * @param c The code point to find. * @param count The length of the string. * @return A pointer to the first occurrence of c in s * or NULL if c is not in s. * @stable ICU 2.0 * * @see u_strchr32 * @see u_memchr * @see u_strFindFirst */ U_CAPI UChar* U_EXPORT2 u_memchr32(const UChar *s, UChar32 c, int32_t count); /** * Find the last occurrence of a BMP code point in a string. 
* A surrogate code point is found only if its match in the text is not * part of a surrogate pair. * A NUL character is found at the string terminator. * * @param s The string to search (contains count UChars). * @param c The BMP code point to find. * @param count The length of the string. * @return A pointer to the last occurrence of c in s * or NULL if c is not in s. * @stable ICU 2.4 * * @see u_strrchr * @see u_memrchr32 * @see u_strFindLast */ U_CAPI UChar* U_EXPORT2 u_memrchr(const UChar *s, UChar c, int32_t count); /** * Find the last occurrence of a code point in a string. * A surrogate code point is found only if its match in the text is not * part of a surrogate pair. * A NUL character is found at the string terminator. * * @param s The string to search (contains count UChars). * @param c The code point to find. * @param count The length of the string. * @return A pointer to the last occurrence of c in s * or NULL if c is not in s. * @stable ICU 2.4 * * @see u_strrchr32 * @see u_memrchr * @see u_strFindLast */ U_CAPI UChar* U_EXPORT2 u_memrchr32(const UChar *s, UChar32 c, int32_t count); /** * Unicode String literals in C. * We need one macro to declare a variable for the string * and to statically preinitialize it if possible, * and a second macro to dynamically initialize such a string variable if necessary. * * The macros are defined for maximum performance. * They work only for strings that contain "invariant characters", i.e., * only latin letters, digits, and some punctuation. * See utypes.h for details. * * A pair of macros for a single string must be used with the same * parameters. * The string parameter must be a C string literal. * The length of the string, not including the terminating * `NUL`, must be specified as a constant. * The U_STRING_DECL macro should be invoked exactly once for one * such string variable before it is used. 
* * Usage: * * U_STRING_DECL(ustringVar1, "Quick-Fox 2", 11); * U_STRING_DECL(ustringVar2, "jumps 5%", 8); * static UBool didInit=false; * * int32_t function() { * if(!didInit) { * U_STRING_INIT(ustringVar1, "Quick-Fox 2", 11); * U_STRING_INIT(ustringVar2, "jumps 5%", 8); * didInit=true; * } * return u_strcmp(ustringVar1, ustringVar2); * } * * Note that the macros will NOT consistently work if their argument is another #`define`. * The following will not work on all platforms, don't use it. * * #define GLUCK "Mr. Gluck" * U_STRING_DECL(var, GLUCK, 9) * U_STRING_INIT(var, GLUCK, 9) * * Instead, use the string literal "Mr. Gluck" as the argument to both macro * calls. * * * @stable ICU 2.0 */ #if defined(U_DECLARE_UTF16) # define U_STRING_DECL(var, cs, length) static const UChar *var=(const UChar *)U_DECLARE_UTF16(cs) /**@stable ICU 2.0 */ # define U_STRING_INIT(var, cs, length) #elif U_SIZEOF_WCHAR_T==U_SIZEOF_UCHAR && (U_CHARSET_FAMILY==U_ASCII_FAMILY || (U_SIZEOF_UCHAR == 2 && defined(U_WCHAR_IS_UTF16))) # define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1]=L ## cs /**@stable ICU 2.0 */ # define U_STRING_INIT(var, cs, length) #elif U_SIZEOF_UCHAR==1 && U_CHARSET_FAMILY==U_ASCII_FAMILY # define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1]=cs /**@stable ICU 2.0 */ # define U_STRING_INIT(var, cs, length) #else # define U_STRING_DECL(var, cs, length) static UChar var[(length)+1] /**@stable ICU 2.0 */ # define U_STRING_INIT(var, cs, length) u_charsToUChars(cs, var, length+1) #endif /** * Unescape a string of characters and write the resulting * Unicode characters to the destination buffer. 
The following escape * sequences are recognized: * * \\uhhhh 4 hex digits; h in [0-9A-Fa-f] * \\Uhhhhhhhh 8 hex digits * \\xhh 1-2 hex digits * \\x{h...} 1-8 hex digits * \\ooo 1-3 octal digits; o in [0-7] * \\cX control-X; X is masked with 0x1F * * as well as the standard ANSI C escapes: * * \\a => U+0007, \\b => U+0008, \\t => U+0009, \\n => U+000A, * \\v => U+000B, \\f => U+000C, \\r => U+000D, \\e => U+001B, * \\" => U+0022, \\' => U+0027, \\? => U+003F, \\\\ => U+005C * * Anything else following a backslash is generically escaped. For * example, "[a\\-z]" returns "[a-z]". * * If an escape sequence is ill-formed, this method returns an empty * string. An example of an ill-formed sequence is "\\u" followed by * fewer than 4 hex digits. * * The above characters are recognized in the compiler's codepage, * that is, they are coded as 'u', '\\', etc. Characters that are * not parts of escape sequences are converted using u_charsToUChars(). * * This function is similar to UnicodeString::unescape() but not * identical to it. The latter takes a source UnicodeString, so it * does escape recognition but no conversion. * * @param src a zero-terminated string of invariant characters * @param dest pointer to buffer to receive converted and unescaped * text and, if there is room, a zero terminator. May be NULL for * preflighting, in which case no UChars will be written, but the * return value will still be valid. On error, an empty string is * stored here (if possible). * @param destCapacity the number of UChars that may be written at * dest. Ignored if dest == NULL. * @return the length of unescaped string. * @see u_unescapeAt * @see UnicodeString#unescape() * @see UnicodeString#unescapeAt() * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 u_unescape(const char *src, UChar *dest, int32_t destCapacity); U_CDECL_BEGIN /** * Callback function for u_unescapeAt() that returns a character of * the source text given an offset and a context pointer. 
The context
 * pointer will be whatever is passed into u_unescapeAt().
 *
 * @param offset the offset, in the source text, of the character to return.
 *               It is passed by value (u_unescapeAt() itself takes a pointer
 *               to the offset and hands the value on to this callback).
 * @param context an opaque pointer passed directly into u_unescapeAt()
 * @return the UChar of the source text at the given offset
 * @see u_unescapeAt
 * @stable ICU 2.0
 */
typedef UChar (U_CALLCONV *UNESCAPE_CHAR_AT)(int32_t offset, void *context);
U_CDECL_END

/**
 * Unescape a single sequence. The character at offset-1 is assumed
 * (without checking) to be a backslash.  This method takes a callback
 * pointer to a function that returns the UChar at a given offset.  By
 * varying this callback, ICU functions are able to unescape char*
 * strings, UnicodeString objects, and UFILE pointers.
 *
 * If offset is out of range, or if the escape sequence is ill-formed,
 * (UChar32)0xFFFFFFFF is returned.  See documentation of u_unescape()
 * for a list of recognized sequences.
 *
 * @param charAt callback function that returns a UChar of the source
 * text given an offset and a context pointer.
 * @param offset pointer to the offset that will be passed to charAt.
 * The offset value will be updated upon return to point after the
 * last parsed character of the escape sequence.  On error the offset
 * is unchanged.
 * @param length the number of characters in the source text.  The
 * last character of the source text is considered to be at offset
 * length-1.
 * @param context an opaque pointer passed directly into charAt.
 * @return the character represented by the escape sequence at
 * offset, or (UChar32)0xFFFFFFFF on error.
 * @see u_unescape()
 * @see UnicodeString#unescape()
 * @see UnicodeString#unescapeAt()
 * @stable ICU 2.0
 */
U_CAPI UChar32 U_EXPORT2
u_unescapeAt(UNESCAPE_CHAR_AT charAt,
             int32_t *offset,
             int32_t length,
             void *context);

/**
 * Uppercase the characters in a string.
 * Casing is locale-dependent and context-sensitive.
 * The result may be longer or shorter than the original.
* The source string and the destination buffer are allowed to overlap. * * @param dest A buffer for the result string. The result will be zero-terminated if * the buffer is large enough. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then * dest may be NULL and the function will only return the length of the result * without writing any of the result string. * @param src The original string * @param srcLength The length of the original string. If -1, then src must be zero-terminated. * @param locale The locale to consider, or "" for the root locale or NULL for the default locale. * @param pErrorCode Must be a valid pointer to an error code value, * which must not indicate a failure before the function call. * @return The length of the result string. It may be greater than destCapacity. In that case, * only some of the result was written to the destination buffer. * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 u_strToUpper(UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, const char *locale, UErrorCode *pErrorCode); /** * Lowercase the characters in a string. * Casing is locale-dependent and context-sensitive. * The result may be longer or shorter than the original. * The source string and the destination buffer are allowed to overlap. * * @param dest A buffer for the result string. The result will be zero-terminated if * the buffer is large enough. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then * dest may be NULL and the function will only return the length of the result * without writing any of the result string. * @param src The original string * @param srcLength The length of the original string. If -1, then src must be zero-terminated. * @param locale The locale to consider, or "" for the root locale or NULL for the default locale. * @param pErrorCode Must be a valid pointer to an error code value, * which must not indicate a failure before the function call. 
* @return The length of the result string. It may be greater than destCapacity. In that case, * only some of the result was written to the destination buffer. * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 u_strToLower(UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, const char *locale, UErrorCode *pErrorCode); #if !UCONFIG_NO_BREAK_ITERATION /** * Titlecase a string. * Casing is locale-dependent and context-sensitive. * Titlecasing uses a break iterator to find the first characters of words * that are to be titlecased. It titlecases those characters and lowercases * all others. * * The titlecase break iterator can be provided to customize for arbitrary * styles, using rules and dictionaries beyond the standard iterators. * It may be more efficient to always provide an iterator to avoid * opening and closing one for each string. * The standard titlecase iterator for the root locale implements the * algorithm of Unicode TR 21. * * This function uses only the setText(), first() and next() methods of the * provided break iterator. * * The result may be longer or shorter than the original. * The source string and the destination buffer are allowed to overlap. * * @param dest A buffer for the result string. The result will be zero-terminated if * the buffer is large enough. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then * dest may be NULL and the function will only return the length of the result * without writing any of the result string. * @param src The original string * @param srcLength The length of the original string. If -1, then src must be zero-terminated. * @param titleIter A break iterator to find the first characters of words * that are to be titlecased. * If none is provided (NULL), then a standard titlecase * break iterator is opened. * @param locale The locale to consider, or "" for the root locale or NULL for the default locale. 
* @param pErrorCode Must be a valid pointer to an error code value, * which must not indicate a failure before the function call. * @return The length of the result string. It may be greater than destCapacity. In that case, * only some of the result was written to the destination buffer. * @stable ICU 2.1 */ U_CAPI int32_t U_EXPORT2 u_strToTitle(UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, UBreakIterator *titleIter, const char *locale, UErrorCode *pErrorCode); #endif /** * Case-folds the characters in a string. * * Case-folding is locale-independent and not context-sensitive, * but there is an option for whether to include or exclude mappings for dotted I * and dotless i that are marked with 'T' in CaseFolding.txt. * * The result may be longer or shorter than the original. * The source string and the destination buffer are allowed to overlap. * * @param dest A buffer for the result string. The result will be zero-terminated if * the buffer is large enough. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then * dest may be NULL and the function will only return the length of the result * without writing any of the result string. * @param src The original string * @param srcLength The length of the original string. If -1, then src must be zero-terminated. * @param options Either U_FOLD_CASE_DEFAULT or U_FOLD_CASE_EXCLUDE_SPECIAL_I * @param pErrorCode Must be a valid pointer to an error code value, * which must not indicate a failure before the function call. * @return The length of the result string. It may be greater than destCapacity. In that case, * only some of the result was written to the destination buffer. 
* @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 u_strFoldCase(UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, uint32_t options, UErrorCode *pErrorCode); #if defined(U_WCHAR_IS_UTF16) || defined(U_WCHAR_IS_UTF32) || !UCONFIG_NO_CONVERSION /** * Convert a UTF-16 string to a wchar_t string. * If it is known at compile time that wchar_t strings are in UTF-16 or UTF-32, then * this function simply calls the fast, dedicated function for that. * Otherwise, two conversions UTF-16 -> default charset -> wchar_t* are performed. * * @param dest A buffer for the result string. The result will be zero-terminated if * the buffer is large enough. * @param destCapacity The size of the buffer (number of wchar_t's). If it is 0, then * dest may be NULL and the function will only return the length of the * result without writing any of the result string (pre-flighting). * @param pDestLength A pointer to receive the number of units written to the destination. If * pDestLength!=NULL then *pDestLength is always set to the * number of output units corresponding to the transformation of * all the input units, even in case of a buffer overflow. * @param src The original source string * @param srcLength The length of the original string. If -1, then src must be zero-terminated. * @param pErrorCode Must be a valid pointer to an error code value, * which must not indicate a failure before the function call. * @return The pointer to destination buffer. * @stable ICU 2.0 */ U_CAPI wchar_t* U_EXPORT2 u_strToWCS(wchar_t *dest, int32_t destCapacity, int32_t *pDestLength, const UChar *src, int32_t srcLength, UErrorCode *pErrorCode); /** * Convert a wchar_t string to UTF-16. * If it is known at compile time that wchar_t strings are in UTF-16 or UTF-32, then * this function simply calls the fast, dedicated function for that. * Otherwise, two conversions wchar_t* -> default charset -> UTF-16 are performed. * * @param dest A buffer for the result string. 
The result will be zero-terminated if * the buffer is large enough. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then * dest may be NULL and the function will only return the length of the * result without writing any of the result string (pre-flighting). * @param pDestLength A pointer to receive the number of units written to the destination. If * pDestLength!=NULL then *pDestLength is always set to the * number of output units corresponding to the transformation of * all the input units, even in case of a buffer overflow. * @param src The original source string * @param srcLength The length of the original string. If -1, then src must be zero-terminated. * @param pErrorCode Must be a valid pointer to an error code value, * which must not indicate a failure before the function call. * @return The pointer to destination buffer. * @stable ICU 2.0 */ U_CAPI UChar* U_EXPORT2 u_strFromWCS(UChar *dest, int32_t destCapacity, int32_t *pDestLength, const wchar_t *src, int32_t srcLength, UErrorCode *pErrorCode); #endif /* defined(U_WCHAR_IS_UTF16) || defined(U_WCHAR_IS_UTF32) || !UCONFIG_NO_CONVERSION */ /** * Convert a UTF-16 string to UTF-8. * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set. * * @param dest A buffer for the result string. The result will be zero-terminated if * the buffer is large enough. * @param destCapacity The size of the buffer (number of chars). If it is 0, then * dest may be NULL and the function will only return the length of the * result without writing any of the result string (pre-flighting). * @param pDestLength A pointer to receive the number of units written to the destination. If * pDestLength!=NULL then *pDestLength is always set to the * number of output units corresponding to the transformation of * all the input units, even in case of a buffer overflow. * @param src The original source string * @param srcLength The length of the original string. 
If -1, then src must be zero-terminated. * @param pErrorCode Must be a valid pointer to an error code value, * which must not indicate a failure before the function call. * @return The pointer to destination buffer. * @stable ICU 2.0 * @see u_strToUTF8WithSub * @see u_strFromUTF8 */ U_CAPI char* U_EXPORT2 u_strToUTF8(char *dest, int32_t destCapacity, int32_t *pDestLength, const UChar *src, int32_t srcLength, UErrorCode *pErrorCode); /** * Convert a UTF-8 string to UTF-16. * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set. * * @param dest A buffer for the result string. The result will be zero-terminated if * the buffer is large enough. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then * dest may be NULL and the function will only return the length of the * result without writing any of the result string (pre-flighting). * @param pDestLength A pointer to receive the number of units written to the destination. If * pDestLength!=NULL then *pDestLength is always set to the * number of output units corresponding to the transformation of * all the input units, even in case of a buffer overflow. * @param src The original source string * @param srcLength The length of the original string. If -1, then src must be zero-terminated. * @param pErrorCode Must be a valid pointer to an error code value, * which must not indicate a failure before the function call. * @return The pointer to destination buffer. * @stable ICU 2.0 * @see u_strFromUTF8WithSub * @see u_strFromUTF8Lenient */ U_CAPI UChar* U_EXPORT2 u_strFromUTF8(UChar *dest, int32_t destCapacity, int32_t *pDestLength, const char *src, int32_t srcLength, UErrorCode *pErrorCode); /** * Convert a UTF-16 string to UTF-8. * * Same as u_strToUTF8() except for the additional subchar which is output for * illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code. 
* With subchar==U_SENTINEL, this function behaves exactly like u_strToUTF8(). * * @param dest A buffer for the result string. The result will be zero-terminated if * the buffer is large enough. * @param destCapacity The size of the buffer (number of chars). If it is 0, then * dest may be NULL and the function will only return the length of the * result without writing any of the result string (pre-flighting). * @param pDestLength A pointer to receive the number of units written to the destination. If * pDestLength!=NULL then *pDestLength is always set to the * number of output units corresponding to the transformation of * all the input units, even in case of a buffer overflow. * @param src The original source string * @param srcLength The length of the original string. If -1, then src must be zero-terminated. * @param subchar The substitution character to use in place of an illegal input sequence, * or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead. * A substitution character can be any valid Unicode code point (up to U+10FFFF) * except for surrogate code points (U+D800..U+DFFF). * The recommended value is U+FFFD "REPLACEMENT CHARACTER". * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0. * Set to 0 if no substitutions occur or subchar<0. * pNumSubstitutions can be NULL. * @param pErrorCode Pointer to a standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return The pointer to destination buffer. * @see u_strToUTF8 * @see u_strFromUTF8WithSub * @stable ICU 3.6 */ U_CAPI char* U_EXPORT2 u_strToUTF8WithSub(char *dest, int32_t destCapacity, int32_t *pDestLength, const UChar *src, int32_t srcLength, UChar32 subchar, int32_t *pNumSubstitutions, UErrorCode *pErrorCode); /** * Convert a UTF-8 string to UTF-16. 
* * Same as u_strFromUTF8() except for the additional subchar which is output for * illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code. * With subchar==U_SENTINEL, this function behaves exactly like u_strFromUTF8(). * * @param dest A buffer for the result string. The result will be zero-terminated if * the buffer is large enough. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then * dest may be NULL and the function will only return the length of the * result without writing any of the result string (pre-flighting). * @param pDestLength A pointer to receive the number of units written to the destination. If * pDestLength!=NULL then *pDestLength is always set to the * number of output units corresponding to the transformation of * all the input units, even in case of a buffer overflow. * @param src The original source string * @param srcLength The length of the original string. If -1, then src must be zero-terminated. * @param subchar The substitution character to use in place of an illegal input sequence, * or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead. * A substitution character can be any valid Unicode code point (up to U+10FFFF) * except for surrogate code points (U+D800..U+DFFF). * The recommended value is U+FFFD "REPLACEMENT CHARACTER". * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0. * Set to 0 if no substitutions occur or subchar<0. * pNumSubstitutions can be NULL. * @param pErrorCode Pointer to a standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return The pointer to destination buffer. 
* @see u_strFromUTF8 * @see u_strFromUTF8Lenient * @see u_strToUTF8WithSub * @stable ICU 3.6 */ U_CAPI UChar* U_EXPORT2 u_strFromUTF8WithSub(UChar *dest, int32_t destCapacity, int32_t *pDestLength, const char *src, int32_t srcLength, UChar32 subchar, int32_t *pNumSubstitutions, UErrorCode *pErrorCode); /** * Convert a UTF-8 string to UTF-16. * * Same as u_strFromUTF8() except that this function is designed to be very fast, * which it achieves by being lenient about malformed UTF-8 sequences. * This function is intended for use in environments where UTF-8 text is * expected to be well-formed. * * Its semantics are: * - Well-formed UTF-8 text is correctly converted to well-formed UTF-16 text. * - The function will not read beyond the input string, nor write beyond * the destCapacity. * - Malformed UTF-8 results in "garbage" 16-bit Unicode strings which may not * be well-formed UTF-16. * The function will resynchronize to valid code point boundaries * within a small number of code points after an illegal sequence. * - Non-shortest forms are not detected and will result in "spoofing" output. * * For further performance improvement, if srcLength is given (>=0), * then it must be destCapacity>=srcLength. * * There is no inverse u_strToUTF8Lenient() function because there is practically * no performance gain from not checking that a UTF-16 string is well-formed. * * @param dest A buffer for the result string. The result will be zero-terminated if * the buffer is large enough. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then * dest may be NULL and the function will only return the length of the * result without writing any of the result string (pre-flighting). * Unlike for other ICU functions, if srcLength>=0 then it * must be destCapacity>=srcLength. * @param pDestLength A pointer to receive the number of units written to the destination. 
If * pDestLength!=NULL then *pDestLength is always set to the * number of output units corresponding to the transformation of * all the input units, even in case of a buffer overflow. * Unlike for other ICU functions, if srcLength>=0 but * destCapacity=0. * Set to 0 if no substitutions occur or subchar<0. * pNumSubstitutions can be NULL. * @param pErrorCode Pointer to a standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return The pointer to destination buffer. * @see u_strToUTF32 * @see u_strFromUTF32WithSub * @stable ICU 4.2 */ U_CAPI UChar32* U_EXPORT2 u_strToUTF32WithSub(UChar32 *dest, int32_t destCapacity, int32_t *pDestLength, const UChar *src, int32_t srcLength, UChar32 subchar, int32_t *pNumSubstitutions, UErrorCode *pErrorCode); /** * Convert a UTF-32 string to UTF-16. * * Same as u_strFromUTF32() except for the additional subchar which is output for * illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code. * With subchar==U_SENTINEL, this function behaves exactly like u_strFromUTF32(). * * @param dest A buffer for the result string. The result will be zero-terminated if * the buffer is large enough. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then * dest may be NULL and the function will only return the length of the * result without writing any of the result string (pre-flighting). * @param pDestLength A pointer to receive the number of units written to the destination. If * pDestLength!=NULL then *pDestLength is always set to the * number of output units corresponding to the transformation of * all the input units, even in case of a buffer overflow. * @param src The original source string * @param srcLength The length of the original string. If -1, then src must be zero-terminated. 
* @param subchar The substitution character to use in place of an illegal input sequence, * or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead. * A substitution character can be any valid Unicode code point (up to U+10FFFF) * except for surrogate code points (U+D800..U+DFFF). * The recommended value is U+FFFD "REPLACEMENT CHARACTER". * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0. * Set to 0 if no substitutions occur or subchar<0. * pNumSubstitutions can be NULL. * @param pErrorCode Pointer to a standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return The pointer to destination buffer. * @see u_strFromUTF32 * @see u_strToUTF32WithSub * @stable ICU 4.2 */ U_CAPI UChar* U_EXPORT2 u_strFromUTF32WithSub(UChar *dest, int32_t destCapacity, int32_t *pDestLength, const UChar32 *src, int32_t srcLength, UChar32 subchar, int32_t *pNumSubstitutions, UErrorCode *pErrorCode); /** * Convert a 16-bit Unicode string to Java Modified UTF-8. * See http://java.sun.com/javase/6/docs/api/java/io/DataInput.html#modified-utf-8 * * This function behaves according to the documentation for Java DataOutput.writeUTF() * except that it does not encode the output length in the destination buffer * and does not have an output length restriction. * See http://java.sun.com/javase/6/docs/api/java/io/DataOutput.html#writeUTF(java.lang.String) * * The input string need not be well-formed UTF-16. * (Therefore there is no subchar parameter.) * * @param dest A buffer for the result string. The result will be zero-terminated if * the buffer is large enough. * @param destCapacity The size of the buffer (number of chars). 
If it is 0, then * dest may be NULL and the function will only return the length of the * result without writing any of the result string (pre-flighting). * @param pDestLength A pointer to receive the number of units written to the destination. If * pDestLength!=NULL then *pDestLength is always set to the * number of output units corresponding to the transformation of * all the input units, even in case of a buffer overflow. * @param src The original source string * @param srcLength The length of the original string. If -1, then src must be zero-terminated. * @param pErrorCode Pointer to a standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return The pointer to destination buffer. * @stable ICU 4.4 * @see u_strToUTF8WithSub * @see u_strFromJavaModifiedUTF8WithSub */ U_CAPI char* U_EXPORT2 u_strToJavaModifiedUTF8( char *dest, int32_t destCapacity, int32_t *pDestLength, const UChar *src, int32_t srcLength, UErrorCode *pErrorCode); /** * Convert a Java Modified UTF-8 string to a 16-bit Unicode string. * If the input string is not well-formed and no substitution char is specified, * then the U_INVALID_CHAR_FOUND error code is set. * * This function behaves according to the documentation for Java DataInput.readUTF() * except that it takes a length parameter rather than * interpreting the first two input bytes as the length. * See http://java.sun.com/javase/6/docs/api/java/io/DataInput.html#readUTF() * * The output string may not be well-formed UTF-16. * * @param dest A buffer for the result string. The result will be zero-terminated if * the buffer is large enough. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then * dest may be NULL and the function will only return the length of the * result without writing any of the result string (pre-flighting). 
* @param pDestLength A pointer to receive the number of units written to the destination. If * pDestLength!=NULL then *pDestLength is always set to the * number of output units corresponding to the transformation of * all the input units, even in case of a buffer overflow. * @param src The original source string * @param srcLength The length of the original string. If -1, then src must be zero-terminated. * @param subchar The substitution character to use in place of an illegal input sequence, * or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead. * A substitution character can be any valid Unicode code point (up to U+10FFFF) * except for surrogate code points (U+D800..U+DFFF). * The recommended value is U+FFFD "REPLACEMENT CHARACTER". * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0. * Set to 0 if no substitutions occur or subchar<0. * pNumSubstitutions can be NULL. * @param pErrorCode Pointer to a standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return The pointer to destination buffer. * @see u_strFromUTF8WithSub * @see u_strFromUTF8Lenient * @see u_strToJavaModifiedUTF8 * @stable ICU 4.4 */ U_CAPI UChar* U_EXPORT2 u_strFromJavaModifiedUTF8WithSub( UChar *dest, int32_t destCapacity, int32_t *pDestLength, const char *src, int32_t srcLength, UChar32 subchar, int32_t *pNumSubstitutions, UErrorCode *pErrorCode); #endif // ucasemap.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * * Copyright (C) 2005-2012, International Business Machines * Corporation and others. All Rights Reserved. 
* ******************************************************************************* * file name: ucasemap.h * encoding: UTF-8 * tab size: 8 (not used) * indentation:4 * * created on: 2005may06 * created by: Markus W. Scherer * * Case mapping service object and functions using it. */ #ifndef __UCASEMAP_H__ #define __UCASEMAP_H__ /** * \file * \brief C API: Unicode case mapping functions using a UCaseMap service object. * * The service object takes care of memory allocations, data loading, and setup * for the attributes, as usual. * * Currently, the functionality provided here does not overlap with uchar.h * and ustring.h, except for ucasemap_toTitle(). * * ucasemap_utf8XYZ() functions operate directly on UTF-8 strings. */ /** * UCaseMap is an opaque service object for newer ICU case mapping functions. * Older functions did not use a service object. * @stable ICU 3.4 */ struct UCaseMap; typedef struct UCaseMap UCaseMap; /**< C typedef for struct UCaseMap. @stable ICU 3.4 */ /** * Open a UCaseMap service object for a locale and a set of options. * The locale ID and options are preprocessed so that functions using the * service object need not process them in each call. * * @param locale ICU locale ID, used for language-dependent * upper-/lower-/title-casing according to the Unicode standard. * Usual semantics: ""=root, NULL=default locale, etc. * @param options Options bit set, used for case folding and string comparisons. * Same flags as for u_foldCase(), u_strFoldCase(), * u_strCaseCompare(), etc. * Use 0 or U_FOLD_CASE_DEFAULT for default behavior. * @param pErrorCode Must be a valid pointer to an error code value, * which must not indicate a failure before the function call. * @return Pointer to a UCaseMap service object, if successful. 
* * @see U_FOLD_CASE_DEFAULT * @see U_FOLD_CASE_EXCLUDE_SPECIAL_I * @see U_TITLECASE_NO_LOWERCASE * @see U_TITLECASE_NO_BREAK_ADJUSTMENT * @stable ICU 3.4 */ U_CAPI UCaseMap * U_EXPORT2 ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode); /** * Close a UCaseMap service object. * @param csm Object to be closed. * @stable ICU 3.4 */ U_CAPI void U_EXPORT2 ucasemap_close(UCaseMap *csm); /** * Get the locale ID that is used for language-dependent case mappings. * @param csm UCaseMap service object. * @return locale ID * @stable ICU 3.4 */ U_CAPI const char * U_EXPORT2 ucasemap_getLocale(const UCaseMap *csm); /** * Get the options bit set that is used for case folding and string comparisons. * @param csm UCaseMap service object. * @return options bit set * @stable ICU 3.4 */ U_CAPI uint32_t U_EXPORT2 ucasemap_getOptions(const UCaseMap *csm); /** * Set the locale ID that is used for language-dependent case mappings. * * @param csm UCaseMap service object. * @param locale Locale ID, see ucasemap_open(). * @param pErrorCode Must be a valid pointer to an error code value, * which must not indicate a failure before the function call. * * @see ucasemap_open * @stable ICU 3.4 */ U_CAPI void U_EXPORT2 ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode); /** * Set the options bit set that is used for case folding and string comparisons. * * @param csm UCaseMap service object. * @param options Options bit set, see ucasemap_open(). * @param pErrorCode Must be a valid pointer to an error code value, * which must not indicate a failure before the function call. * * @see ucasemap_open * @stable ICU 3.4 */ U_CAPI void U_EXPORT2 ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode); #if !UCONFIG_NO_BREAK_ITERATION /** * Get the break iterator that is used for titlecasing. * Do not modify the returned break iterator. * @param csm UCaseMap service object. 
* @return titlecasing break iterator * @stable ICU 3.8 */ U_CAPI const UBreakIterator * U_EXPORT2 ucasemap_getBreakIterator(const UCaseMap *csm); /** * Set the break iterator that is used for titlecasing. * The UCaseMap service object releases a previously set break iterator * and "adopts" this new one, taking ownership of it. * It will be released in a subsequent call to ucasemap_setBreakIterator() * or ucasemap_close(). * * Break iterator operations are not thread-safe. Therefore, titlecasing * functions use non-const UCaseMap objects. It is not possible to titlecase * strings concurrently using the same UCaseMap. * * @param csm UCaseMap service object. * @param iterToAdopt Break iterator to be adopted for titlecasing. * @param pErrorCode Must be a valid pointer to an error code value, * which must not indicate a failure before the function call. * * @see ucasemap_toTitle * @see ucasemap_utf8ToTitle * @stable ICU 3.8 */ U_CAPI void U_EXPORT2 ucasemap_setBreakIterator(UCaseMap *csm, UBreakIterator *iterToAdopt, UErrorCode *pErrorCode); /** * Titlecase a UTF-16 string. This function is almost a duplicate of u_strToTitle(), * except that it takes ucasemap_setOptions() into account and has performance * advantages from being able to use a UCaseMap object for multiple case mapping * operations, saving setup time. * * Casing is locale-dependent and context-sensitive. * Titlecasing uses a break iterator to find the first characters of words * that are to be titlecased. It titlecases those characters and lowercases * all others. (This can be modified with ucasemap_setOptions().) * * Note: This function takes a non-const UCaseMap pointer because it will * open a default break iterator if no break iterator was set yet, * and effectively call ucasemap_setBreakIterator(); * also because the break iterator is stateful and will be modified during * the iteration. 
* * The titlecase break iterator can be provided to customize for arbitrary * styles, using rules and dictionaries beyond the standard iterators. * The standard titlecase iterator for the root locale implements the * algorithm of Unicode TR 21. * * This function uses only the setText(), first() and next() methods of the * provided break iterator. * * The result may be longer or shorter than the original. * The source string and the destination buffer must not overlap. * * @param csm UCaseMap service object. This pointer is non-const! * See the note above for details. * @param dest A buffer for the result string. The result will be NUL-terminated if * the buffer is large enough. * The contents is undefined in case of failure. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then * dest may be NULL and the function will only return the length of the result * without writing any of the result string. * @param src The original string. * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. * @param pErrorCode Must be a valid pointer to an error code value, * which must not indicate a failure before the function call. * @return The length of the result string, if successful - or in case of a buffer overflow, * in which case it will be greater than destCapacity. * * @see u_strToTitle * @stable ICU 3.8 */ U_CAPI int32_t U_EXPORT2 ucasemap_toTitle(UCaseMap *csm, UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, UErrorCode *pErrorCode); #endif // UCONFIG_NO_BREAK_ITERATION /** * Lowercase the characters in a UTF-8 string. * Casing is locale-dependent and context-sensitive. * The result may be longer or shorter than the original. * The source string and the destination buffer must not overlap. * * @param csm UCaseMap service object. * @param dest A buffer for the result string. The result will be NUL-terminated if * the buffer is large enough. 
* The contents is undefined in case of failure. * @param destCapacity The size of the buffer (number of bytes). If it is 0, then * dest may be NULL and the function will only return the length of the result * without writing any of the result string. * @param src The original string. * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. * @param pErrorCode Must be a valid pointer to an error code value, * which must not indicate a failure before the function call. * @return The length of the result string, if successful - or in case of a buffer overflow, * in which case it will be greater than destCapacity. * * @see u_strToLower * @stable ICU 3.4 */ U_CAPI int32_t U_EXPORT2 ucasemap_utf8ToLower(const UCaseMap *csm, char *dest, int32_t destCapacity, const char *src, int32_t srcLength, UErrorCode *pErrorCode); /** * Uppercase the characters in a UTF-8 string. * Casing is locale-dependent and context-sensitive. * The result may be longer or shorter than the original. * The source string and the destination buffer must not overlap. * * @param csm UCaseMap service object. * @param dest A buffer for the result string. The result will be NUL-terminated if * the buffer is large enough. * The contents is undefined in case of failure. * @param destCapacity The size of the buffer (number of bytes). If it is 0, then * dest may be NULL and the function will only return the length of the result * without writing any of the result string. * @param src The original string. * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. * @param pErrorCode Must be a valid pointer to an error code value, * which must not indicate a failure before the function call. * @return The length of the result string, if successful - or in case of a buffer overflow, * in which case it will be greater than destCapacity. 
* * @see u_strToUpper * @stable ICU 3.4 */ U_CAPI int32_t U_EXPORT2 ucasemap_utf8ToUpper(const UCaseMap *csm, char *dest, int32_t destCapacity, const char *src, int32_t srcLength, UErrorCode *pErrorCode); #if !UCONFIG_NO_BREAK_ITERATION /** * Titlecase a UTF-8 string. * Casing is locale-dependent and context-sensitive. * Titlecasing uses a break iterator to find the first characters of words * that are to be titlecased. It titlecases those characters and lowercases * all others. (This can be modified with ucasemap_setOptions().) * * Note: This function takes a non-const UCaseMap pointer because it will * open a default break iterator if no break iterator was set yet, * and effectively call ucasemap_setBreakIterator(); * also because the break iterator is stateful and will be modified during * the iteration. * * The titlecase break iterator can be provided to customize for arbitrary * styles, using rules and dictionaries beyond the standard iterators. * The standard titlecase iterator for the root locale implements the * algorithm of Unicode TR 21. * * This function uses only the setUText(), first(), next() and close() methods of the * provided break iterator. * * The result may be longer or shorter than the original. * The source string and the destination buffer must not overlap. * * @param csm UCaseMap service object. This pointer is non-const! * See the note above for details. * @param dest A buffer for the result string. The result will be NUL-terminated if * the buffer is large enough. * The contents is undefined in case of failure. * @param destCapacity The size of the buffer (number of bytes). If it is 0, then * dest may be NULL and the function will only return the length of the result * without writing any of the result string. * @param src The original string. * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. 
* @param pErrorCode Must be a valid pointer to an error code value, * which must not indicate a failure before the function call. * @return The length of the result string, if successful - or in case of a buffer overflow, * in which case it will be greater than destCapacity. * * @see u_strToTitle * @see U_TITLECASE_NO_LOWERCASE * @see U_TITLECASE_NO_BREAK_ADJUSTMENT * @stable ICU 3.8 */ U_CAPI int32_t U_EXPORT2 ucasemap_utf8ToTitle(UCaseMap *csm, char *dest, int32_t destCapacity, const char *src, int32_t srcLength, UErrorCode *pErrorCode); #endif /** * Case-folds the characters in a UTF-8 string. * * Case-folding is locale-independent and not context-sensitive, * but there is an option for whether to include or exclude mappings for dotted I * and dotless i that are marked with 'T' in CaseFolding.txt. * * The result may be longer or shorter than the original. * The source string and the destination buffer must not overlap. * * @param csm UCaseMap service object. * @param dest A buffer for the result string. The result will be NUL-terminated if * the buffer is large enough. * The contents is undefined in case of failure. * @param destCapacity The size of the buffer (number of bytes). If it is 0, then * dest may be NULL and the function will only return the length of the result * without writing any of the result string. * @param src The original string. * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. * @param pErrorCode Must be a valid pointer to an error code value, * which must not indicate a failure before the function call. * @return The length of the result string, if successful - or in case of a buffer overflow, * in which case it will be greater than destCapacity. 
* * @see u_strFoldCase * @see ucasemap_setOptions * @see U_FOLD_CASE_DEFAULT * @see U_FOLD_CASE_EXCLUDE_SPECIAL_I * @stable ICU 3.8 */ U_CAPI int32_t U_EXPORT2 ucasemap_utf8FoldCase(const UCaseMap *csm, char *dest, int32_t destCapacity, const char *src, int32_t srcLength, UErrorCode *pErrorCode); #endif // parseerr.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (C) 1999-2005, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * Date Name Description * 03/14/00 aliu Creation. * 06/27/00 aliu Change from C++ class to C struct ********************************************************************** */ #ifndef PARSEERR_H #define PARSEERR_H /** * \file * \brief C API: Parse Error Information */ /** * The capacity of the context strings in UParseError. * @stable ICU 2.0 */ enum { U_PARSE_CONTEXT_LEN = 16 }; /** * A UParseError struct is used to returned detailed information about * parsing errors. It is used by ICU parsing engines that parse long * rules, patterns, or programs, where the text being parsed is long * enough that more information than a UErrorCode is needed to * localize the error. * *

The line, offset, and context fields are optional; parsing * engines may choose not to use them. * *

The preContext and postContext strings include some part of the * context surrounding the error. If the source text is "let for=7" * and "for" is the error (e.g., because it is a reserved word), then * some examples of what a parser might produce are the following: * *

 * preContext   postContext
 * ""           ""            The parser does not support context
 * "let "       "=7"          Pre- and post-context only
 * "let "       "for=7"       Pre- and post-context and error text
 * ""           "for"         Error text only
 * 
* *

Examples of engines which use UParseError (or may use it in the * future) are Transliterator, RuleBasedBreakIterator, and * RegexPattern. * * @stable ICU 2.0 */ typedef struct UParseError { /** * The line on which the error occurred. If the parser uses this * field, it sets it to the line number of the source text line on * which the error appears, which will be a value >= 1. If the * parse does not support line numbers, the value will be <= 0. * @stable ICU 2.0 */ int32_t line; /** * The character offset to the error. If the line field is >= 1, * then this is the offset from the start of the line. Otherwise, * this is the offset from the start of the text. If the parser * does not support this field, it will have a value < 0. * @stable ICU 2.0 */ int32_t offset; /** * Textual context before the error. Null-terminated. The empty * string if not supported by parser. * @stable ICU 2.0 */ UChar preContext[U_PARSE_CONTEXT_LEN]; /** * The error itself and/or textual context after the error. * Null-terminated. The empty string if not supported by parser. * @stable ICU 2.0 */ UChar postContext[U_PARSE_CONTEXT_LEN]; } UParseError; #endif // usprep.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * * Copyright (C) 2003-2014, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* * file name: usprep.h * encoding: UTF-8 * tab size: 8 (not used) * indentation:4 * * created on: 2003jul2 * created by: Ram Viswanadha */ #ifndef __USPREP_H__ #define __USPREP_H__ /** * \file * \brief C API: Implements the StringPrep algorithm. */ /** * * StringPrep API implements the StingPrep framework as described by RFC 3454. * StringPrep prepares Unicode strings for use in network protocols. 
* Profiles of StringPrep are sets of rules and data according to which * Unicode Strings are prepared. Each profile contains tables which describe * how a code point should be treated. The tables are broadly classified into *

 * <ul>
 *     <li> Unassigned Table: Contains code points that are unassigned
 *          in the Unicode Version supported by StringPrep. Currently
 *          RFC 3454 supports Unicode 3.2. </li>
 *     <li> Prohibited Table: Contains code points that are prohibited from
 *          the output of the StringPrep processing function. </li>
 *     <li> Mapping Table: Contains code points that are deleted from the output or case mapped. </li>
 * </ul>
* * The procedure for preparing Unicode strings: *
 * <ol>
 *     <li> Map: For each character in the input, check if it has a mapping
 *          and, if so, replace it with its mapping. </li>
 *     <li> Normalize: Possibly normalize the result of step 1 using Unicode
 *          normalization. </li>
 *     <li> Prohibit: Check for any characters that are not allowed in the
 *          output. If any are found, return an error. </li>
 *     <li> Check bidi: Possibly check for right-to-left characters, and if
 *          any are found, make sure that the whole string satisfies the
 *          requirements for bidirectional strings. If the string does not
 *          satisfy the requirements for bidirectional strings, return an
 *          error. </li>
 * </ol>
* @author Ram Viswanadha */ #if !UCONFIG_NO_IDNA /** * The StringPrep profile * @stable ICU 2.8 */ typedef struct UStringPrepProfile UStringPrepProfile; /** * Option to prohibit processing of unassigned code points in the input * * @see usprep_prepare * @stable ICU 2.8 */ #define USPREP_DEFAULT 0x0000 /** * Option to allow processing of unassigned code points in the input * * @see usprep_prepare * @stable ICU 2.8 */ #define USPREP_ALLOW_UNASSIGNED 0x0001 /** * enums for the standard stringprep profile types * supported by usprep_openByType. * @see usprep_openByType * @stable ICU 4.2 */ typedef enum UStringPrepProfileType { /** * RFC3491 Nameprep * @stable ICU 4.2 */ USPREP_RFC3491_NAMEPREP, /** * RFC3530 nfs4_cs_prep * @stable ICU 4.2 */ USPREP_RFC3530_NFS4_CS_PREP, /** * RFC3530 nfs4_cs_prep with case insensitive option * @stable ICU 4.2 */ USPREP_RFC3530_NFS4_CS_PREP_CI, /** * RFC3530 nfs4_cis_prep * @stable ICU 4.2 */ USPREP_RFC3530_NFS4_CIS_PREP, /** * RFC3530 nfs4_mixed_prep for prefix * @stable ICU 4.2 */ USPREP_RFC3530_NFS4_MIXED_PREP_PREFIX, /** * RFC3530 nfs4_mixed_prep for suffix * @stable ICU 4.2 */ USPREP_RFC3530_NFS4_MIXED_PREP_SUFFIX, /** * RFC3722 iSCSI * @stable ICU 4.2 */ USPREP_RFC3722_ISCSI, /** * RFC3920 XMPP Nodeprep * @stable ICU 4.2 */ USPREP_RFC3920_NODEPREP, /** * RFC3920 XMPP Resourceprep * @stable ICU 4.2 */ USPREP_RFC3920_RESOURCEPREP, /** * RFC4011 Policy MIB Stringprep * @stable ICU 4.2 */ USPREP_RFC4011_MIB, /** * RFC4013 SASLprep * @stable ICU 4.2 */ USPREP_RFC4013_SASLPREP, /** * RFC4505 trace * @stable ICU 4.2 */ USPREP_RFC4505_TRACE, /** * RFC4518 LDAP * @stable ICU 4.2 */ USPREP_RFC4518_LDAP, /** * RFC4518 LDAP for case ignore, numeric and stored prefix * matching rules * @stable ICU 4.2 */ USPREP_RFC4518_LDAP_CI } UStringPrepProfileType; /** * Creates a StringPrep profile from the data file. * * @param path string containing the full path pointing to the directory * where the profile reside followed by the package name * e.g. 
"/usr/resource/my_app/profiles/mydata" on a Unix system. * if NULL, ICU default data files will be used. * @param fileName name of the profile file to be opened * @param status ICU error code in/out parameter. Must not be NULL. * Must fulfill U_SUCCESS before the function call. * @return Pointer to UStringPrepProfile that is opened. Should be closed by * calling usprep_close() * @see usprep_close() * @stable ICU 2.8 */ U_CAPI UStringPrepProfile* U_EXPORT2 usprep_open(const char* path, const char* fileName, UErrorCode* status); /** * Creates a StringPrep profile for the specified profile type. * * @param type The profile type * @param status ICU error code in/out parameter. Must not be NULL. * Must fulfill U_SUCCESS before the function call. * @return Pointer to UStringPrepProfile that is opened. Should be closed by * calling usprep_close() * @see usprep_close() * @stable ICU 4.2 */ U_CAPI UStringPrepProfile* U_EXPORT2 usprep_openByType(UStringPrepProfileType type, UErrorCode* status); /** * Closes the profile * @param profile The profile to close * @stable ICU 2.8 */ U_CAPI void U_EXPORT2 usprep_close(UStringPrepProfile* profile); /** * Prepare the input buffer for use in applications with the given profile. This operation maps, normalizes(NFKC), * checks for prohibited and BiDi characters in the order defined by RFC 3454 * depending on the options specified in the profile. * * @param prep The profile to use * @param src Pointer to UChar buffer containing the string to prepare * @param srcLength Number of characters in the source string * @param dest Pointer to the destination buffer to receive the output * @param destCapacity The capacity of destination array * @param options A bit set of options: * * - USPREP_DEFAULT Prohibit processing of unassigned code points in the input * * - USPREP_ALLOW_UNASSIGNED Treat the unassigned code points are in the input * as normal Unicode code points. 
* * @param parseError Pointer to UParseError struct to receive information on position * of error if an error is encountered. Can be NULL. * @param status ICU in/out error code parameter. * U_INVALID_CHAR_FOUND if src contains * unmatched single surrogates. * U_INDEX_OUTOFBOUNDS_ERROR if src contains * too many code points. * U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough * @return The number of UChars in the destination buffer * @stable ICU 2.8 */ U_CAPI int32_t U_EXPORT2 usprep_prepare( const UStringPrepProfile* prep, const UChar* src, int32_t srcLength, UChar* dest, int32_t destCapacity, int32_t options, UParseError* parseError, UErrorCode* status ); #endif /* #if !UCONFIG_NO_IDNA */ #endif // uidna.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * * Copyright (C) 2003-2014, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* * file name: uidna.h * encoding: UTF-8 * tab size: 8 (not used) * indentation:4 * * created on: 2003feb1 * created by: Ram Viswanadha */ #ifndef __UIDNA_H__ #define __UIDNA_H__ #if !UCONFIG_NO_IDNA /** * \file * \brief C API: Internationalizing Domain Names in Applications (IDNA) * * IDNA2008 is implemented according to UTS #46, see the IDNA C++ class in idna.h. * * The C API functions which do take a UIDNA * service object pointer * implement UTS #46 and IDNA2008. * * IDNA2003 is obsolete. * The C API functions which do not take a service object pointer * implement IDNA2003. They are all deprecated. */ /* * IDNA option bit set values. */ enum { /** * Default options value: None of the other options are set. * For use in static worker and factory methods. 
* @stable ICU 2.6 */ UIDNA_DEFAULT=0, /** * Option to check whether the input conforms to the STD3 ASCII rules, * for example the restriction of labels to LDH characters * (ASCII Letters, Digits and Hyphen-Minus). * For use in static worker and factory methods. * @stable ICU 2.6 */ UIDNA_USE_STD3_RULES=2, /** * IDNA option to check for whether the input conforms to the BiDi rules. * For use in static worker and factory methods. *

This option is ignored by the IDNA2003 implementation. * (IDNA2003 always performs a BiDi check.) * @stable ICU 4.6 */ UIDNA_CHECK_BIDI=4, /** * IDNA option to check for whether the input conforms to the CONTEXTJ rules. * For use in static worker and factory methods. *

This option is ignored by the IDNA2003 implementation. * (The CONTEXTJ check is new in IDNA2008.) * @stable ICU 4.6 */ UIDNA_CHECK_CONTEXTJ=8, /** * IDNA option for nontransitional processing in ToASCII(). * For use in static worker and factory methods. *

By default, ToASCII() uses transitional processing. *

This option is ignored by the IDNA2003 implementation. * (This is only relevant for compatibility of newer IDNA implementations with IDNA2003.) * @stable ICU 4.6 */ UIDNA_NONTRANSITIONAL_TO_ASCII=0x10, /** * IDNA option for nontransitional processing in ToUnicode(). * For use in static worker and factory methods. *

By default, ToUnicode() uses transitional processing. *

This option is ignored by the IDNA2003 implementation. * (This is only relevant for compatibility of newer IDNA implementations with IDNA2003.) * @stable ICU 4.6 */ UIDNA_NONTRANSITIONAL_TO_UNICODE=0x20, /** * IDNA option to check for whether the input conforms to the CONTEXTO rules. * For use in static worker and factory methods. *

This option is ignored by the IDNA2003 implementation. * (The CONTEXTO check is new in IDNA2008.) *

This is for use by registries for IDNA2008 conformance. * UTS #46 does not require the CONTEXTO check. * @stable ICU 49 */ UIDNA_CHECK_CONTEXTO=0x40 }; /** * Opaque C service object type for the new IDNA API. * @stable ICU 4.6 */ struct UIDNA; typedef struct UIDNA UIDNA; /**< C typedef for struct UIDNA. @stable ICU 4.6 */ /** * Returns a UIDNA instance which implements UTS #46. * Returns an unmodifiable instance, owned by the caller. * Cache it for multiple operations, and uidna_close() it when done. * The instance is thread-safe, that is, it can be used concurrently. * * For details about the UTS #46 implementation see the IDNA C++ class in idna.h. * * @param options Bit set to modify the processing and error checking. * See option bit set values in uidna.h. * @param pErrorCode Standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return the UTS #46 UIDNA instance, if successful * @stable ICU 4.6 */ U_CAPI UIDNA * U_EXPORT2 uidna_openUTS46(uint32_t options, UErrorCode *pErrorCode); /** * Closes a UIDNA instance. * @param idna UIDNA instance to be closed * @stable ICU 4.6 */ U_CAPI void U_EXPORT2 uidna_close(UIDNA *idna); /** * Output container for IDNA processing errors. * Initialize with UIDNA_INFO_INITIALIZER: * \code * UIDNAInfo info = UIDNA_INFO_INITIALIZER; * int32_t length = uidna_nameToASCII(..., &info, &errorCode); * if(U_SUCCESS(errorCode) && info.errors!=0) { ... } * \endcode * @stable ICU 4.6 */ typedef struct UIDNAInfo { /** sizeof(UIDNAInfo) @stable ICU 4.6 */ int16_t size; /** * Set to true if transitional and nontransitional processing produce different results. * For details see C++ IDNAInfo::isTransitionalDifferent(). * @stable ICU 4.6 */ UBool isTransitionalDifferent; UBool reservedB3; /**< Reserved field, do not use. @internal */ /** * Bit set indicating IDNA processing errors. 
0 if no errors. * See UIDNA_ERROR_... constants. * @stable ICU 4.6 */ uint32_t errors; int32_t reservedI2; /**< Reserved field, do not use. @internal */ int32_t reservedI3; /**< Reserved field, do not use. @internal */ } UIDNAInfo; /** * Static initializer for a UIDNAInfo struct. * @stable ICU 4.6 */ #if (NTDDI_VERSION >= NTDDI_WIN10_CO) #define UIDNA_INFO_INITIALIZER { \ (int16_t)sizeof(UIDNAInfo), \ false, false, \ 0, 0, 0 } #else #define UIDNA_INFO_INITIALIZER { \ (int16_t)sizeof(UIDNAInfo), \ FALSE, FALSE, \ 0, 0, 0 } #endif // (NTDDI_VERSION >= NTDDI_WIN10_CO) /** * Converts a single domain name label into its ASCII form for DNS lookup. * If any processing step fails, then pInfo->errors will be non-zero and * the result might not be an ASCII string. * The label might be modified according to the types of errors. * Labels with severe errors will be left in (or turned into) their Unicode form. * * The UErrorCode indicates an error only in exceptional cases, * such as a U_MEMORY_ALLOCATION_ERROR. * * @param idna UIDNA instance * @param label Input domain name label * @param length Label length, or -1 if NUL-terminated * @param dest Destination string buffer * @param capacity Destination buffer capacity * @param pInfo Output container of IDNA processing details. * @param pErrorCode Standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return destination string length * @stable ICU 4.6 */ U_CAPI int32_t U_EXPORT2 uidna_labelToASCII(const UIDNA *idna, const UChar *label, int32_t length, UChar *dest, int32_t capacity, UIDNAInfo *pInfo, UErrorCode *pErrorCode); /** * Converts a single domain name label into its Unicode form for human-readable display. * If any processing step fails, then pInfo->errors will be non-zero. * The label might be modified according to the types of errors. 
* * The UErrorCode indicates an error only in exceptional cases, * such as a U_MEMORY_ALLOCATION_ERROR. * * @param idna UIDNA instance * @param label Input domain name label * @param length Label length, or -1 if NUL-terminated * @param dest Destination string buffer * @param capacity Destination buffer capacity * @param pInfo Output container of IDNA processing details. * @param pErrorCode Standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return destination string length * @stable ICU 4.6 */ U_CAPI int32_t U_EXPORT2 uidna_labelToUnicode(const UIDNA *idna, const UChar *label, int32_t length, UChar *dest, int32_t capacity, UIDNAInfo *pInfo, UErrorCode *pErrorCode); /** * Converts a whole domain name into its ASCII form for DNS lookup. * If any processing step fails, then pInfo->errors will be non-zero and * the result might not be an ASCII string. * The domain name might be modified according to the types of errors. * Labels with severe errors will be left in (or turned into) their Unicode form. * * The UErrorCode indicates an error only in exceptional cases, * such as a U_MEMORY_ALLOCATION_ERROR. * * @param idna UIDNA instance * @param name Input domain name * @param length Domain name length, or -1 if NUL-terminated * @param dest Destination string buffer * @param capacity Destination buffer capacity * @param pInfo Output container of IDNA processing details. * @param pErrorCode Standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) 
* @return destination string length * @stable ICU 4.6 */ U_CAPI int32_t U_EXPORT2 uidna_nameToASCII(const UIDNA *idna, const UChar *name, int32_t length, UChar *dest, int32_t capacity, UIDNAInfo *pInfo, UErrorCode *pErrorCode); /** * Converts a whole domain name into its Unicode form for human-readable display. * If any processing step fails, then pInfo->errors will be non-zero. * The domain name might be modified according to the types of errors. * * The UErrorCode indicates an error only in exceptional cases, * such as a U_MEMORY_ALLOCATION_ERROR. * * @param idna UIDNA instance * @param name Input domain name * @param length Domain name length, or -1 if NUL-terminated * @param dest Destination string buffer * @param capacity Destination buffer capacity * @param pInfo Output container of IDNA processing details. * @param pErrorCode Standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return destination string length * @stable ICU 4.6 */ U_CAPI int32_t U_EXPORT2 uidna_nameToUnicode(const UIDNA *idna, const UChar *name, int32_t length, UChar *dest, int32_t capacity, UIDNAInfo *pInfo, UErrorCode *pErrorCode); /* UTF-8 versions of the processing methods --------------------------------- */ /** * Converts a single domain name label into its ASCII form for DNS lookup. * UTF-8 version of uidna_labelToASCII(), same behavior. * * @param idna UIDNA instance * @param label Input domain name label * @param length Label length, or -1 if NUL-terminated * @param dest Destination string buffer * @param capacity Destination buffer capacity * @param pInfo Output container of IDNA processing details. * @param pErrorCode Standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. 
(See User Guide for details.) * @return destination string length * @stable ICU 4.6 */ U_CAPI int32_t U_EXPORT2 uidna_labelToASCII_UTF8(const UIDNA *idna, const char *label, int32_t length, char *dest, int32_t capacity, UIDNAInfo *pInfo, UErrorCode *pErrorCode); /** * Converts a single domain name label into its Unicode form for human-readable display. * UTF-8 version of uidna_labelToUnicode(), same behavior. * * @param idna UIDNA instance * @param label Input domain name label * @param length Label length, or -1 if NUL-terminated * @param dest Destination string buffer * @param capacity Destination buffer capacity * @param pInfo Output container of IDNA processing details. * @param pErrorCode Standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return destination string length * @stable ICU 4.6 */ U_CAPI int32_t U_EXPORT2 uidna_labelToUnicodeUTF8(const UIDNA *idna, const char *label, int32_t length, char *dest, int32_t capacity, UIDNAInfo *pInfo, UErrorCode *pErrorCode); /** * Converts a whole domain name into its ASCII form for DNS lookup. * UTF-8 version of uidna_nameToASCII(), same behavior. * * @param idna UIDNA instance * @param name Input domain name * @param length Domain name length, or -1 if NUL-terminated * @param dest Destination string buffer * @param capacity Destination buffer capacity * @param pInfo Output container of IDNA processing details. * @param pErrorCode Standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) 
* @return destination string length * @stable ICU 4.6 */ U_CAPI int32_t U_EXPORT2 uidna_nameToASCII_UTF8(const UIDNA *idna, const char *name, int32_t length, char *dest, int32_t capacity, UIDNAInfo *pInfo, UErrorCode *pErrorCode); /** * Converts a whole domain name into its Unicode form for human-readable display. * UTF-8 version of uidna_nameToUnicode(), same behavior. * * @param idna UIDNA instance * @param name Input domain name * @param length Domain name length, or -1 if NUL-terminated * @param dest Destination string buffer * @param capacity Destination buffer capacity * @param pInfo Output container of IDNA processing details. * @param pErrorCode Standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return destination string length * @stable ICU 4.6 */ U_CAPI int32_t U_EXPORT2 uidna_nameToUnicodeUTF8(const UIDNA *idna, const char *name, int32_t length, char *dest, int32_t capacity, UIDNAInfo *pInfo, UErrorCode *pErrorCode); /* * IDNA error bit set values. * When a domain name or label fails a processing step or does not meet the * validity criteria, then one or more of these error bits are set. */ enum { /** * A non-final domain name label (or the whole domain name) is empty. * @stable ICU 4.6 */ UIDNA_ERROR_EMPTY_LABEL=1, /** * A domain name label is longer than 63 bytes. * (See STD13/RFC1034 3.1. Name space specifications and terminology.) * This is only checked in ToASCII operations, and only if the output label is all-ASCII. * @stable ICU 4.6 */ UIDNA_ERROR_LABEL_TOO_LONG=2, /** * A domain name is longer than 255 bytes in its storage form. * (See STD13/RFC1034 3.1. Name space specifications and terminology.) * This is only checked in ToASCII operations, and only if the output domain name is all-ASCII. 
* @stable ICU 4.6 */ UIDNA_ERROR_DOMAIN_NAME_TOO_LONG=4, /** * A label starts with a hyphen-minus ('-'). * @stable ICU 4.6 */ UIDNA_ERROR_LEADING_HYPHEN=8, /** * A label ends with a hyphen-minus ('-'). * @stable ICU 4.6 */ UIDNA_ERROR_TRAILING_HYPHEN=0x10, /** * A label contains hyphen-minus ('-') in the third and fourth positions. * @stable ICU 4.6 */ UIDNA_ERROR_HYPHEN_3_4=0x20, /** * A label starts with a combining mark. * @stable ICU 4.6 */ UIDNA_ERROR_LEADING_COMBINING_MARK=0x40, /** * A label or domain name contains disallowed characters. * @stable ICU 4.6 */ UIDNA_ERROR_DISALLOWED=0x80, /** * A label starts with "xn--" but does not contain valid Punycode. * That is, an xn-- label failed Punycode decoding. * @stable ICU 4.6 */ UIDNA_ERROR_PUNYCODE=0x100, /** * A label contains a dot=full stop. * This can occur in an input string for a single-label function. * @stable ICU 4.6 */ UIDNA_ERROR_LABEL_HAS_DOT=0x200, /** * An ACE label does not contain a valid label string. * The label was successfully ACE (Punycode) decoded but the resulting * string had severe validation errors. For example, * it might contain characters that are not allowed in ACE labels, * or it might not be normalized. * @stable ICU 4.6 */ UIDNA_ERROR_INVALID_ACE_LABEL=0x400, /** * A label does not meet the IDNA BiDi requirements (for right-to-left characters). * @stable ICU 4.6 */ UIDNA_ERROR_BIDI=0x800, /** * A label does not meet the IDNA CONTEXTJ requirements. * @stable ICU 4.6 */ UIDNA_ERROR_CONTEXTJ=0x1000, /** * A label does not meet the IDNA CONTEXTO requirements for punctuation characters. * Some punctuation characters "Would otherwise have been DISALLOWED" * but are allowed in certain contexts. (RFC 5892) * @stable ICU 49 */ UIDNA_ERROR_CONTEXTO_PUNCTUATION=0x2000, /** * A label does not meet the IDNA CONTEXTO requirements for digits. * Arabic-Indic Digits (U+066x) must not be mixed with Extended Arabic-Indic Digits (U+06Fx). 
* @stable ICU 49 */ UIDNA_ERROR_CONTEXTO_DIGITS=0x4000 }; #endif /* #if !UCONFIG_NO_IDNA */ #endif // ubrk.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ****************************************************************************** * Copyright (C) 1996-2015, International Business Machines Corporation and others. * All Rights Reserved. ****************************************************************************** */ #ifndef UBRK_H #define UBRK_H /** * A text-break iterator. * For usage in C programs. */ #ifndef UBRK_TYPEDEF_UBREAK_ITERATOR # define UBRK_TYPEDEF_UBREAK_ITERATOR /** * Opaque type representing an ICU Break iterator object. * @stable ICU 2.0 */ typedef struct UBreakIterator UBreakIterator; #endif #if !UCONFIG_NO_BREAK_ITERATION /** * \file * \brief C API: BreakIterator * *

BreakIterator C API

* * The BreakIterator C API defines methods for finding the location * of boundaries in text. A UBreakIterator maintains a * current position and scans over text, returning the index of characters * where boundaries occur. *

* Line boundary analysis determines where a text string can be broken * when line-wrapping. The mechanism correctly handles punctuation and * hyphenated words. *

* Note: The locale keyword "lb" can be used to modify line break * behavior according to the CSS level 3 line-break options, see * http://dev.w3.org/csswg/css-text/#line-breaking. For example: * "ja@lb=strict", "zh@lb=loose". *

* Sentence boundary analysis allows selection with correct * interpretation of periods within numbers and abbreviations, and * trailing punctuation marks such as quotation marks and parentheses. *

* Note: The locale keyword "ss" can be used to enable use of * segmentation suppression data (preventing breaks in English after * abbreviations such as "Mr." or "Est.", for example), as follows: * "en@ss=standard". *

* Word boundary analysis is used by search and replace functions, as * well as within text editing applications that allow the user to * select words with a double click. Word selection provides correct * interpretation of punctuation marks within and following * words. Characters that are not part of a word, such as symbols or * punctuation marks, have word-breaks on both sides. *

* Character boundary analysis identifies the boundaries of * "Extended Grapheme Clusters", which are groupings of codepoints * that should be treated as character-like units for many text operations. * Please see Unicode Standard Annex #29, Unicode Text Segmentation, * http://www.unicode.org/reports/tr29/ for additional information * on grapheme clusters and guidelines on their use. *

* Title boundary analysis locates all positions, * typically starts of words, that should be set to Title Case * when title casing the text. *

* The text boundary positions are found according to the rules * described in Unicode Standard Annex #29, Text Boundaries, and * Unicode Standard Annex #14, Line Breaking Properties. These * are available at http://www.unicode.org/reports/tr14/ and * http://www.unicode.org/reports/tr29/. *

* In addition to the plain C API defined in this header file, an * object oriented C++ API with equivalent functionality is defined in the * file brkiter.h. *

* Code snippets illustrating the use of the Break Iterator APIs * are available in the ICU User Guide, * https://unicode-org.github.io/icu/userguide/boundaryanalysis/ * and in the sample program icu/source/samples/break/break.cpp */ /** The possible types of text boundaries. @stable ICU 2.0 */ typedef enum UBreakIteratorType { /** Character breaks @stable ICU 2.0 */ UBRK_CHARACTER = 0, /** Word breaks @stable ICU 2.0 */ UBRK_WORD = 1, /** Line breaks @stable ICU 2.0 */ UBRK_LINE = 2, /** Sentence breaks @stable ICU 2.0 */ UBRK_SENTENCE = 3, } UBreakIteratorType; /** Value indicating all text boundaries have been returned. * @stable ICU 2.0 */ #define UBRK_DONE ((int32_t) -1) /** * Enum constants for the word break tags returned by * getRuleStatus(). A range of values is defined for each category of * word, to allow for further subdivisions of a category in future releases. * Applications should check for tag values falling within the range, rather * than for single individual values. * * The numeric values of all of these constants are stable (will not change). * * @stable ICU 2.2 */ typedef enum UWordBreak { /** Tag value for "words" that do not fit into any of other categories. * Includes spaces and most punctuation. */ UBRK_WORD_NONE = 0, /** Upper bound for tags for uncategorized words. */ UBRK_WORD_NONE_LIMIT = 100, /** Tag value for words that appear to be numbers, lower limit. */ UBRK_WORD_NUMBER = 100, /** Tag value for words that appear to be numbers, upper limit. */ UBRK_WORD_NUMBER_LIMIT = 200, /** Tag value for words that contain letters, excluding * hiragana, katakana or ideographic characters, lower limit. 
*/ UBRK_WORD_LETTER = 200, /** Tag value for words containing letters, upper limit */ UBRK_WORD_LETTER_LIMIT = 300, /** Tag value for words containing kana characters, lower limit */ UBRK_WORD_KANA = 300, /** Tag value for words containing kana characters, upper limit */ UBRK_WORD_KANA_LIMIT = 400, /** Tag value for words containing ideographic characters, lower limit */ UBRK_WORD_IDEO = 400, /** Tag value for words containing ideographic characters, upper limit */ UBRK_WORD_IDEO_LIMIT = 500 } UWordBreak; /** * Enum constants for the line break tags returned by getRuleStatus(). * A range of values is defined for each category of * word, to allow for further subdivisions of a category in future releases. * Applications should check for tag values falling within the range, rather * than for single individual values. * * The numeric values of all of these constants are stable (will not change). * * @stable ICU 2.8 */ typedef enum ULineBreakTag { /** Tag value for soft line breaks, positions at which a line break * is acceptable but not required */ UBRK_LINE_SOFT = 0, /** Upper bound for soft line breaks. */ UBRK_LINE_SOFT_LIMIT = 100, /** Tag value for a hard, or mandatory line break */ UBRK_LINE_HARD = 100, /** Upper bound for hard line breaks. */ UBRK_LINE_HARD_LIMIT = 200 } ULineBreakTag; /** * Enum constants for the sentence break tags returned by getRuleStatus(). * A range of values is defined for each category of * sentence, to allow for further subdivisions of a category in future releases. * Applications should check for tag values falling within the range, rather * than for single individual values. * * The numeric values of all of these constants are stable (will not change). * * @stable ICU 2.8 */ typedef enum USentenceBreakTag { /** Tag value for for sentences ending with a sentence terminator * ('.', '?', '!', etc.) character, possibly followed by a * hard separator (CR, LF, PS, etc.) 
*/ UBRK_SENTENCE_TERM = 0, /** Upper bound for tags for sentences ended by sentence terminators. */ UBRK_SENTENCE_TERM_LIMIT = 100, /** Tag value for for sentences that do not contain an ending * sentence terminator ('.', '?', '!', etc.) character, but * are ended only by a hard separator (CR, LF, PS, etc.) or end of input. */ UBRK_SENTENCE_SEP = 100, /** Upper bound for tags for sentences ended by a separator. */ UBRK_SENTENCE_SEP_LIMIT = 200 /** Tag value for a hard, or mandatory line break */ } USentenceBreakTag; /** * Open a new UBreakIterator for locating text boundaries for a specified locale. * A UBreakIterator may be used for detecting character, line, word, * and sentence breaks in text. * @param type The type of UBreakIterator to open: one of UBRK_CHARACTER, UBRK_WORD, * UBRK_LINE, UBRK_SENTENCE * @param locale The locale specifying the text-breaking conventions. Note that * locale keys such as "lb" and "ss" may be used to modify text break behavior, * see general discussion of BreakIterator C API. * @param text The text to be iterated over. May be null, in which case ubrk_setText() is * used to specify the text to be iterated. * @param textLength The number of characters in text, or -1 if null-terminated. * @param status A UErrorCode to receive any errors. * @return A UBreakIterator for the specified locale. * @see ubrk_openRules * @stable ICU 2.0 */ U_CAPI UBreakIterator* U_EXPORT2 ubrk_open(UBreakIteratorType type, const char *locale, const UChar *text, int32_t textLength, UErrorCode *status); /** * Open a new UBreakIterator for locating text boundaries using specified breaking rules. * The rule syntax is ... (TBD) * @param rules A set of rules specifying the text breaking conventions. * @param rulesLength The number of characters in rules, or -1 if null-terminated. * @param text The text to be iterated over. May be null, in which case ubrk_setText() is * used to specify the text to be iterated. 
* @param textLength The number of characters in text, or -1 if null-terminated. * @param parseErr Receives position and context information for any syntax errors * detected while parsing the rules. * @param status A UErrorCode to receive any errors. * @return A UBreakIterator for the specified rules. * @see ubrk_open * @stable ICU 2.2 */ U_CAPI UBreakIterator* U_EXPORT2 ubrk_openRules(const UChar *rules, int32_t rulesLength, const UChar *text, int32_t textLength, UParseError *parseErr, UErrorCode *status); #if (NTDDI_VERSION >= NTDDI_WIN10_RS5) /** * Open a new UBreakIterator for locating text boundaries using precompiled binary rules. * Opening a UBreakIterator this way is substantially faster than using ubrk_openRules. * Binary rules may be obtained using ubrk_getBinaryRules. The compiled rules are not * compatible across different major versions of ICU, nor across platforms of different * endianness or different base character set family (ASCII vs EBCDIC). * @param binaryRules A set of compiled binary rules specifying the text breaking * conventions. Ownership of the storage containing the compiled * rules remains with the caller of this function. The compiled * rules must not be modified or deleted during the life of the * break iterator. * @param rulesLength The length of binaryRules in bytes; must be >= 0. * @param text The text to be iterated over. May be null, in which case * ubrk_setText() is used to specify the text to be iterated. * @param textLength The number of characters in text, or -1 if null-terminated. * @param status Pointer to UErrorCode to receive any errors. * @return UBreakIterator for the specified rules. 
* @see ubrk_getBinaryRules * @stable ICU 59 */ U_CAPI UBreakIterator* U_EXPORT2 ubrk_openBinaryRules(const uint8_t *binaryRules, int32_t rulesLength, const UChar * text, int32_t textLength, UErrorCode * status); #endif // (NTDDI_VERSION >= NTDDI_WIN10_RS5) /** * Thread safe cloning operation * @param bi iterator to be cloned * @param stackBuffer Deprecated functionality as of ICU 52, use NULL.
* user allocated space for the new clone. If NULL new memory will be allocated. * If buffer is not large enough, new memory will be allocated. * Clients can use the U_BRK_SAFECLONE_BUFFERSIZE. * @param pBufferSize Deprecated functionality as of ICU 52, use NULL or 1.
* pointer to size of allocated space. * If *pBufferSize == 0, a sufficient size for use in cloning will * be returned ('pre-flighting') * If *pBufferSize is not enough for a stack-based safe clone, * new memory will be allocated. * @param status to indicate whether the operation went on smoothly or there were errors * An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were necessary. * @return pointer to the new clone * @stable ICU 2.0 */ U_CAPI UBreakIterator * U_EXPORT2 ubrk_safeClone( const UBreakIterator *bi, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status); #if (NTDDI_VERSION >= NTDDI_WIN11_ZN) /** * Thread safe cloning operation * @param bi iterator to be cloned * @param status to indicate whether the operation went on smoothly or there were errors * @return pointer to the new clone * @stable ICU 69 */ U_CAPI UBreakIterator * U_EXPORT2 ubrk_clone(const UBreakIterator *bi, UErrorCode *status); #endif // (NTDDI_VERSION >= NTDDI_WIN11_ZN) /** * Close a UBreakIterator. * Once closed, a UBreakIterator may no longer be used. * @param bi The break iterator to close. * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ubrk_close(UBreakIterator *bi); /** * Sets an existing iterator to point to a new piece of text. * The break iterator retains a pointer to the supplied text. * The caller must not modify or delete the text while the BreakIterator * retains the reference. * * @param bi The iterator to use * @param text The text to be set * @param textLength The length of the text * @param status The error code * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ubrk_setText(UBreakIterator* bi, const UChar* text, int32_t textLength, UErrorCode* status); /** * Sets an existing iterator to point to a new piece of text. * * All index positions returned by break iterator functions are * native indices from the UText. For example, when breaking UTF-8 * encoded text, the break positions returned by \ref ubrk_next, \ref ubrk_previous, etc. 
* will be UTF-8 string indices, not UTF-16 positions. * * @param bi The iterator to use * @param text The text to be set. * This function makes a shallow clone of the supplied UText. This means * that the caller is free to immediately close or otherwise reuse the * UText that was passed as a parameter, but that the underlying text itself * must not be altered while being referenced by the break iterator. * @param status The error code * @stable ICU 3.4 */ U_CAPI void U_EXPORT2 ubrk_setUText(UBreakIterator* bi, UText* text, UErrorCode* status); /** * Determine the most recently-returned text boundary. * * @param bi The break iterator to use. * @return The character index most recently returned by \ref ubrk_next, \ref ubrk_previous, * \ref ubrk_first, or \ref ubrk_last. * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 ubrk_current(const UBreakIterator *bi); /** * Advance the iterator to the boundary following the current boundary. * * @param bi The break iterator to use. * @return The character index of the next text boundary, or UBRK_DONE * if all text boundaries have been returned. * @see ubrk_previous * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 ubrk_next(UBreakIterator *bi); /** * Set the iterator position to the boundary preceding the current boundary. * * @param bi The break iterator to use. * @return The character index of the preceding text boundary, or UBRK_DONE * if all text boundaries have been returned. * @see ubrk_next * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 ubrk_previous(UBreakIterator *bi); /** * Set the iterator position to zero, the start of the text being scanned. * @param bi The break iterator to use. * @return The new iterator position (zero). * @see ubrk_last * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 ubrk_first(UBreakIterator *bi); /** * Set the iterator position to the index immediately beyond the last character in the text being scanned. * This is not the same as the last character. * @param bi The break iterator to use. 
* @return The character offset immediately beyond the last character in the * text being scanned. * @see ubrk_first * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 ubrk_last(UBreakIterator *bi); /** * Set the iterator position to the first boundary preceding the specified offset. * The new position is always smaller than offset, or UBRK_DONE. * @param bi The break iterator to use. * @param offset The offset to begin scanning. * @return The text boundary preceding offset, or UBRK_DONE. * @see ubrk_following * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 ubrk_preceding(UBreakIterator *bi, int32_t offset); /** * Advance the iterator to the first boundary following the specified offset. * The value returned is always greater than offset, or UBRK_DONE. * @param bi The break iterator to use. * @param offset The offset to begin scanning. * @return The text boundary following offset, or UBRK_DONE. * @see ubrk_preceding * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 ubrk_following(UBreakIterator *bi, int32_t offset); /** * Get a locale for which text breaking information is available. * A UBreakIterator in a locale returned by this function will perform the correct * text breaking for the locale. * @param index The index of the desired locale. * @return A locale for which number text breaking information is available, or 0 if none. * @see ubrk_countAvailable * @stable ICU 2.0 */ U_CAPI const char* U_EXPORT2 ubrk_getAvailable(int32_t index); /** * Determine how many locales have text breaking information available. * This function is most useful as determining the loop ending condition for * calls to \ref ubrk_getAvailable. * @return The number of locales for which text breaking information is available. * @see ubrk_getAvailable * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 ubrk_countAvailable(void); /** * Returns true if the specified position is a boundary position. As a side * effect, leaves the iterator pointing to the first boundary position at * or after "offset". 
* @param bi The break iterator to use. * @param offset the offset to check. * @return True if "offset" is a boundary position. * @stable ICU 2.0 */ U_CAPI UBool U_EXPORT2 ubrk_isBoundary(UBreakIterator *bi, int32_t offset); /** * Return the status from the break rule that determined the most recently * returned break position. The values appear in the rule source * within brackets, {123}, for example. For rules that do not specify a * status, a default value of 0 is returned. *

* For word break iterators, the possible values are defined in enum UWordBreak. * @stable ICU 2.2 */ U_CAPI int32_t U_EXPORT2 ubrk_getRuleStatus(UBreakIterator *bi); /** * Get the statuses from the break rules that determined the most recently * returned break position. The values appear in the rule source * within brackets, {123}, for example. The default status value for rules * that do not explicitly provide one is zero. *

* For word break iterators, the possible values are defined in enum UWordBreak. * @param bi The break iterator to use * @param fillInVec an array to be filled in with the status values. * @param capacity the length of the supplied vector. A length of zero causes * the function to return the number of status values, in the * normal way, without attempting to store any values. * @param status receives error codes. * @return The number of rule status values from rules that determined * the most recent boundary returned by the break iterator. * @stable ICU 3.0 */ U_CAPI int32_t U_EXPORT2 ubrk_getRuleStatusVec(UBreakIterator *bi, int32_t *fillInVec, int32_t capacity, UErrorCode *status); /** * Return the locale of the break iterator. You can choose between the valid and * the actual locale. * @param bi break iterator * @param type locale type (valid or actual) * @param status error code * @return locale string * @stable ICU 2.8 */ U_CAPI const char* U_EXPORT2 ubrk_getLocaleByType(const UBreakIterator *bi, ULocDataLocaleType type, UErrorCode* status); /** * Set the subject text string upon which the break iterator is operating * without changing any other aspect of the state. * The new and previous text strings must have the same content. * * This function is intended for use in environments where ICU is operating on * strings that may move around in memory. It provides a mechanism for notifying * ICU that the string has been relocated, and providing a new UText to access the * string in its new position. * * Note that the break iterator never copies the underlying text * of a string being processed, but always operates directly on the original text * provided by the user. Refreshing simply drops the references to the old text * and replaces them with references to the new. * * Caution: this function is normally used only by very specialized * system-level code. One example use case is with garbage collection * that moves the text in memory. 
* * @param bi The break iterator. * @param text The new (moved) text string. * @param status Receives errors detected by this function. * * @stable ICU 49 */ U_CAPI void U_EXPORT2 ubrk_refreshUText(UBreakIterator *bi, UText *text, UErrorCode *status); #if (NTDDI_VERSION >= NTDDI_WIN10_RS5) /** * Get a compiled binary version of the rules specifying the behavior of a UBreakIterator. * The binary rules may be used with ubrk_openBinaryRules to open a new UBreakIterator * more quickly than using ubrk_openRules. The compiled rules are not compatible across * different major versions of ICU, nor across platforms of different endianness or * different base character set family (ASCII vs EBCDIC). Supports preflighting (with * binaryRules=NULL and rulesCapacity=0) to get the rules length without copying them to * the binaryRules buffer. However, whether preflighting or not, if the actual length * is greater than INT32_MAX, then the function returns 0 and sets *status to * U_INDEX_OUTOFBOUNDS_ERROR. * @param bi The break iterator to use. * @param binaryRules Buffer to receive the compiled binary rules; set to NULL for * preflighting. * @param rulesCapacity Capacity (in bytes) of the binaryRules buffer; set to 0 for * preflighting. Must be >= 0. * @param status Pointer to UErrorCode to receive any errors, such as * U_BUFFER_OVERFLOW_ERROR, U_INDEX_OUTOFBOUNDS_ERROR, or * U_ILLEGAL_ARGUMENT_ERROR. * @return The actual byte length of the binary rules, if <= INT32_MAX; * otherwise 0. If not preflighting and this is larger than * rulesCapacity, *status will be set to an error. * @see ubrk_openBinaryRules * @stable ICU 59 */ U_CAPI int32_t U_EXPORT2 ubrk_getBinaryRules(UBreakIterator *bi, uint8_t * binaryRules, int32_t rulesCapacity, UErrorCode * status); #endif // (NTDDI_VERSION >= NTDDI_WIN10_RS5) #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ #endif // icudataver.h // Copyright (C) 2016 and later: Unicode, Inc. and others. 
// License & terms of use: http://www.unicode.org/copyright.html /* ****************************************************************************** * * Copyright (C) 2009-2013, International Business Machines * Corporation and others. All Rights Reserved. * ****************************************************************************** */ /** * \file * \brief C API: access to ICU Data Version number */ #ifndef __ICU_DATA_VER_H__ #define __ICU_DATA_VER_H__ /** * @stable ICU 49 */ #define U_ICU_VERSION_BUNDLE "icuver" /** * @stable ICU 49 */ #define U_ICU_DATA_KEY "DataVersion" /** * Retrieves the data version from icuver and stores it in dataVersionFillin. * * @param dataVersionFillin icuver data version information to be filled in if not-null * @param status stores the error code from the calls to resource bundle * * @stable ICU 49 */ U_CAPI void U_EXPORT2 u_getDataVersion(UVersionInfo dataVersionFillin, UErrorCode *status); #endif // alphaindex.h // No supported content // basictz.h // No supported content // calendar.h // No supported content // choicfmt.h // No supported content // coleitr.h // No supported content // coll.h // No supported content // compactdecimalformat.h // No supported content // curramt.h // No supported content // currpinf.h // No supported content // currunit.h // No supported content // datefmt.h // No supported content // dcfmtsym.h // No supported content // decimfmt.h // No supported content // displayoptions.h // No supported content // dtfmtsym.h // No supported content // dtitvfmt.h // No supported content // dtitvinf.h // No supported content // dtptngen.h // No supported content // dtrule.h // No supported content // fieldpos.h // No supported content // fmtable.h // No supported content // format.h // No supported content // formattedvalue.h // No supported content // fpositer.h // No supported content // gender.h // No supported content // gregocal.h // No supported content // listformatter.h // No supported content // measfmt.h 
// No supported content // measunit.h // No supported content // measure.h // No supported content // msgfmt.h // No supported content // nounit.h // No supported content // numberformatter.h // No supported content // numberrangeformatter.h // No supported content // numfmt.h // No supported content // numsys.h // No supported content // plurfmt.h // No supported content // plurrule.h // No supported content // rbnf.h // No supported content // rbtz.h // No supported content // regex.h // No supported content // region.h // No supported content // reldatefmt.h // No supported content // scientificnumberformatter.h // No supported content // search.h // No supported content // selfmt.h // No supported content // simpletz.h // No supported content // smpdtfmt.h // No supported content // sortkey.h // No supported content // stsearch.h // No supported content // tblcoll.h // No supported content // timezone.h // No supported content // tmunit.h // No supported content // tmutamt.h // No supported content // tmutfmt.h // No supported content // translit.h // No supported content // tzfmt.h // No supported content // tznames.h // No supported content // tzrule.h // No supported content // tztrans.h // No supported content // ucal.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 1996-2015, International Business Machines Corporation and * others. All Rights Reserved. ******************************************************************************* */ #ifndef UCAL_H #define UCAL_H #if !UCONFIG_NO_FORMATTING /** * \file * \brief C API: Calendar * *

Calendar C API

* * UCalendar C API is used for converting between a UDate object * and a set of integer fields such as UCAL_YEAR, UCAL_MONTH, * UCAL_DATE, UCAL_HOUR, and so on. * (A UDate object represents a specific instant in * time with millisecond precision. See UDate * for information about the UDate.) * *

* Types of UCalendar interpret a UDate * according to the rules of a specific calendar system. The C API * provides the enum UCalendarType with UCAL_TRADITIONAL and * UCAL_GREGORIAN. *

* Like other locale-sensitive C API, calendar API provides a * function, ucal_open(), which returns a pointer to * UCalendar whose time fields have been initialized * with the current date and time. We need to specify the type of * calendar to be opened and the timezoneId. * \htmlonly

\endhtmlonly *
 * \code
 * UCalendar *caldef;
 * UChar *tzId;
 * UErrorCode status = U_ZERO_ERROR;
 * tzId=(UChar*)malloc(sizeof(UChar) * (strlen("PST") +1) );
 * u_uastrcpy(tzId, "PST");
 * caldef=ucal_open(tzId, u_strlen(tzId), NULL, UCAL_TRADITIONAL, &status);
 * \endcode
 * 
* \htmlonly
\endhtmlonly * *

* A UCalendar object can produce all the time field values * needed to implement the date-time formatting for a particular language * and calendar style (for example, Japanese-Gregorian, Japanese-Traditional). * *

* When computing a UDate from time fields, two special circumstances * may arise: there may be insufficient information to compute the * UDate (such as only year and month but no day in the month), * or there may be inconsistent information (such as "Tuesday, July 15, 1996" * -- July 15, 1996 is actually a Monday). * *

* Insufficient information. The calendar will use default * information to specify the missing fields. This may vary by calendar; for * the Gregorian calendar, the default for a field is the same as that of the * start of the epoch: i.e., UCAL_YEAR = 1970, UCAL_MONTH = JANUARY, UCAL_DATE = 1, etc. * *

* Inconsistent information. If fields conflict, the calendar * will give preference to fields set more recently. For example, when * determining the day, the calendar will look for one of the following * combinations of fields. The most recent combination, as determined by the * most recently set single field, will be used. * * \htmlonly

\endhtmlonly *
 * \code
 * UCAL_MONTH + UCAL_DAY_OF_MONTH
 * UCAL_MONTH + UCAL_WEEK_OF_MONTH + UCAL_DAY_OF_WEEK
 * UCAL_MONTH + UCAL_DAY_OF_WEEK_IN_MONTH + UCAL_DAY_OF_WEEK
 * UCAL_DAY_OF_YEAR
 * UCAL_DAY_OF_WEEK + UCAL_WEEK_OF_YEAR
 * \endcode
 * 
* \htmlonly
\endhtmlonly * * For the time of day: * * \htmlonly
\endhtmlonly *
 * \code
 * UCAL_HOUR_OF_DAY
 * UCAL_AM_PM + UCAL_HOUR
 * \endcode
 * 
* \htmlonly
\endhtmlonly * *

* Note: for some non-Gregorian calendars, different * fields may be necessary for complete disambiguation. For example, a full * specification of the historical Arabic astronomical calendar requires year, * month, day-of-month and day-of-week in some cases. * *

* Note: There are certain possible ambiguities in * interpretation of certain singular times, which are resolved in the * following ways: *

    *
  1. 24:00:00 "belongs" to the following day. That is, * 23:59 on Dec 31, 1969 < 24:00 on Jan 1, 1970 < 24:01:00 on Jan 1, 1970 * *
  2. Although historically not precise, midnight also belongs to "am", * and noon belongs to "pm", so on the same day, * 12:00 am (midnight) < 12:01 am, and 12:00 pm (noon) < 12:01 pm *
* *

* The date or time format strings are not part of the definition of a * calendar, as those must be modifiable or overridable by the user at * runtime. Use {@link icu::DateFormat} * to format dates. * *

* Calendar provides an API for field "rolling", where fields * can be incremented or decremented, but wrap around. For example, rolling the * month up in the date December 12, 1996 results in * January 12, 1996. * *

* Calendar also provides a date arithmetic function for * adding the specified (signed) amount of time to a particular time field. * For example, subtracting 5 days from the date September 12, 1996 * results in September 7, 1996. * *

* The Japanese calendar uses a combination of era name and year number. * When an emperor of Japan abdicates and a new emperor ascends the throne, * a new era is declared and year number is reset to 1. Even if the date of * abdication is scheduled ahead of time, the new era name might not be * announced until just before the date. In such case, ICU4C may include * a start date of future era without actual era name, but not enabled * by default. ICU4C users who want to test the behavior of the future era * can enable the tentative era by: *

    *
 *   - Environment variable ICU_ENABLE_TENTATIVE_ERA=true.
 *
* * @stable ICU 2.0 */ /** * The time zone ID reserved for unknown time zone. * It behaves like the GMT/UTC time zone but has the special ID "Etc/Unknown". * @stable ICU 4.8 */ #define UCAL_UNKNOWN_ZONE_ID "Etc/Unknown" /** A calendar. * For usage in C programs. * @stable ICU 2.0 */ typedef void* UCalendar; /** Possible types of UCalendars * @stable ICU 2.0 */ enum UCalendarType { /** * Despite the name, UCAL_TRADITIONAL designates the locale's default calendar, * which may be the Gregorian calendar or some other calendar. * @stable ICU 2.0 */ UCAL_TRADITIONAL, /** * A better name for UCAL_TRADITIONAL. * @stable ICU 4.2 */ UCAL_DEFAULT = UCAL_TRADITIONAL, /** * Unambiguously designates the Gregorian calendar for the locale. * @stable ICU 2.0 */ UCAL_GREGORIAN }; /** @stable ICU 2.0 */ typedef enum UCalendarType UCalendarType; /** Possible fields in a UCalendar * @stable ICU 2.0 */ enum UCalendarDateFields { /** * Field number indicating the era, e.g., AD or BC in the Gregorian (Julian) calendar. * This is a calendar-specific value. * @stable ICU 2.6 */ UCAL_ERA, /** * Field number indicating the year. This is a calendar-specific value. * @stable ICU 2.6 */ UCAL_YEAR, /** * Field number indicating the month. This is a calendar-specific value. * The first month of the year is * JANUARY; the last depends on the number of months in a year. * @see #UCAL_JANUARY * @see #UCAL_FEBRUARY * @see #UCAL_MARCH * @see #UCAL_APRIL * @see #UCAL_MAY * @see #UCAL_JUNE * @see #UCAL_JULY * @see #UCAL_AUGUST * @see #UCAL_SEPTEMBER * @see #UCAL_OCTOBER * @see #UCAL_NOVEMBER * @see #UCAL_DECEMBER * @see #UCAL_UNDECIMBER * @stable ICU 2.6 */ UCAL_MONTH, /** * Field number indicating the * week number within the current year. The first week of the year, as * defined by UCAL_FIRST_DAY_OF_WEEK and UCAL_MINIMAL_DAYS_IN_FIRST_WEEK * attributes, has value 1. Subclasses define * the value of UCAL_WEEK_OF_YEAR for days before the first week of * the year. 
* @see ucal_getAttribute * @see ucal_setAttribute * @stable ICU 2.6 */ UCAL_WEEK_OF_YEAR, /** * Field number indicating the * week number within the current month. The first week of the month, as * defined by UCAL_FIRST_DAY_OF_WEEK and UCAL_MINIMAL_DAYS_IN_FIRST_WEEK * attributes, has value 1. Subclasses define * the value of WEEK_OF_MONTH for days before the first week of * the month. * @see ucal_getAttribute * @see ucal_setAttribute * @see #UCAL_FIRST_DAY_OF_WEEK * @see #UCAL_MINIMAL_DAYS_IN_FIRST_WEEK * @stable ICU 2.6 */ UCAL_WEEK_OF_MONTH, /** * Field number indicating the * day of the month. This is a synonym for DAY_OF_MONTH. * The first day of the month has value 1. * @see #UCAL_DAY_OF_MONTH * @stable ICU 2.6 */ UCAL_DATE, /** * Field number indicating the day * number within the current year. The first day of the year has value 1. * @stable ICU 2.6 */ UCAL_DAY_OF_YEAR, /** * Field number indicating the day * of the week. This field takes values SUNDAY, * MONDAY, TUESDAY, WEDNESDAY, * THURSDAY, FRIDAY, and SATURDAY. * @see #UCAL_SUNDAY * @see #UCAL_MONDAY * @see #UCAL_TUESDAY * @see #UCAL_WEDNESDAY * @see #UCAL_THURSDAY * @see #UCAL_FRIDAY * @see #UCAL_SATURDAY * @stable ICU 2.6 */ UCAL_DAY_OF_WEEK, /** * Field number indicating the * ordinal number of the day of the week within the current month. Together * with the DAY_OF_WEEK field, this uniquely specifies a day * within a month. Unlike WEEK_OF_MONTH and * WEEK_OF_YEAR, this field's value does not depend on * getFirstDayOfWeek() or * getMinimalDaysInFirstWeek(). DAY_OF_MONTH 1 * through 7 always correspond to DAY_OF_WEEK_IN_MONTH * 1; 8 through 15 correspond to * DAY_OF_WEEK_IN_MONTH 2, and so on. * DAY_OF_WEEK_IN_MONTH 0 indicates the week before * DAY_OF_WEEK_IN_MONTH 1. Negative values count back from the * end of the month, so the last Sunday of a month is specified as * DAY_OF_WEEK = SUNDAY, DAY_OF_WEEK_IN_MONTH = -1. 
Because * negative values count backward they will usually be aligned differently * within the month than positive values. For example, if a month has 31 * days, DAY_OF_WEEK_IN_MONTH -1 will overlap * DAY_OF_WEEK_IN_MONTH 5 and the end of 4. * @see #UCAL_DAY_OF_WEEK * @see #UCAL_WEEK_OF_MONTH * @stable ICU 2.6 */ UCAL_DAY_OF_WEEK_IN_MONTH, /** * Field number indicating * whether the HOUR is before or after noon. * E.g., at 10:04:15.250 PM the AM_PM is PM. * @see #UCAL_AM * @see #UCAL_PM * @see #UCAL_HOUR * @stable ICU 2.6 */ UCAL_AM_PM, /** * Field number indicating the * hour of the morning or afternoon. HOUR is used for the 12-hour * clock. * E.g., at 10:04:15.250 PM the HOUR is 10. * @see #UCAL_AM_PM * @see #UCAL_HOUR_OF_DAY * @stable ICU 2.6 */ UCAL_HOUR, /** * Field number indicating the * hour of the day. HOUR_OF_DAY is used for the 24-hour clock. * E.g., at 10:04:15.250 PM the HOUR_OF_DAY is 22. * @see #UCAL_HOUR * @stable ICU 2.6 */ UCAL_HOUR_OF_DAY, /** * Field number indicating the * minute within the hour. * E.g., at 10:04:15.250 PM the UCAL_MINUTE is 4. * @stable ICU 2.6 */ UCAL_MINUTE, /** * Field number indicating the * second within the minute. * E.g., at 10:04:15.250 PM the UCAL_SECOND is 15. * @stable ICU 2.6 */ UCAL_SECOND, /** * Field number indicating the * millisecond within the second. * E.g., at 10:04:15.250 PM the UCAL_MILLISECOND is 250. * @stable ICU 2.6 */ UCAL_MILLISECOND, /** * Field number indicating the * raw offset from GMT in milliseconds. * @stable ICU 2.6 */ UCAL_ZONE_OFFSET, /** * Field number indicating the * daylight savings offset in milliseconds. * @stable ICU 2.6 */ UCAL_DST_OFFSET, /** * Field number * indicating the extended year corresponding to the * UCAL_WEEK_OF_YEAR field. This may be one greater or less * than the value of UCAL_EXTENDED_YEAR. * @stable ICU 2.6 */ UCAL_YEAR_WOY, /** * Field number * indicating the localized day of week. 
This will be a value from 1 * to 7 inclusive, with 1 being the localized first day of the week. * @stable ICU 2.6 */ UCAL_DOW_LOCAL, /** * Year of this calendar system, encompassing all supra-year fields. For example, * in Gregorian/Julian calendars, positive Extended Year values indicate years AD, * 1 BC = 0 extended, 2 BC = -1 extended, and so on. * @stable ICU 2.8 */ UCAL_EXTENDED_YEAR, /** * Field number * indicating the modified Julian day number. This is different from * the conventional Julian day number in two regards. First, it * demarcates days at local zone midnight, rather than noon GMT. * Second, it is a local number; that is, it depends on the local time * zone. It can be thought of as a single number that encompasses all * the date-related fields. * @stable ICU 2.8 */ UCAL_JULIAN_DAY, /** * Ranges from 0 to 23:59:59.999 (regardless of DST). This field behaves exactly * like a composite of all time-related fields, not including the zone fields. As such, * it also reflects discontinuities of those fields on DST transition days. On a day * of DST onset, it will jump forward. On a day of DST cessation, it will jump * backward. This reflects the fact that it must be combined with the DST_OFFSET field * to obtain a unique local time value. * @stable ICU 2.8 */ UCAL_MILLISECONDS_IN_DAY, /** * Whether or not the current month is a leap month (0 or 1). See the Chinese calendar for * an example of this. */ UCAL_IS_LEAP_MONTH, /* Do not conditionalize the following with #ifndef U_HIDE_DEPRECATED_API, * it is needed for layout of Calendar, DateFormat, and other objects */ #ifndef U_FORCE_HIDE_DEPRECATED_API /** * One more than the highest normal UCalendarDateFields value. * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420. */ UCAL_FIELD_COUNT, #endif // U_FORCE_HIDE_DEPRECATED_API /** * Field number indicating the * day of the month. This is a synonym for UCAL_DATE. * The first day of the month has value 1. 
* @see #UCAL_DATE * Synonym for UCAL_DATE * @stable ICU 2.8 **/ UCAL_DAY_OF_MONTH=UCAL_DATE }; /** @stable ICU 2.0 */ typedef enum UCalendarDateFields UCalendarDateFields; /** * Useful constant for days of week. Note: Calendar day-of-week is 1-based. Clients * who create locale resources for the field of first-day-of-week should be aware of * this. For instance, in US locale, first-day-of-week is set to 1, i.e., UCAL_SUNDAY. */ /** Possible days of the week in a UCalendar * @stable ICU 2.0 */ enum UCalendarDaysOfWeek { /** Sunday */ UCAL_SUNDAY = 1, /** Monday */ UCAL_MONDAY, /** Tuesday */ UCAL_TUESDAY, /** Wednesday */ UCAL_WEDNESDAY, /** Thursday */ UCAL_THURSDAY, /** Friday */ UCAL_FRIDAY, /** Saturday */ UCAL_SATURDAY }; /** @stable ICU 2.0 */ typedef enum UCalendarDaysOfWeek UCalendarDaysOfWeek; /** Possible months in a UCalendar. Note: Calendar month is 0-based. * @stable ICU 2.0 */ enum UCalendarMonths { /** January */ UCAL_JANUARY, /** February */ UCAL_FEBRUARY, /** March */ UCAL_MARCH, /** April */ UCAL_APRIL, /** May */ UCAL_MAY, /** June */ UCAL_JUNE, /** July */ UCAL_JULY, /** August */ UCAL_AUGUST, /** September */ UCAL_SEPTEMBER, /** October */ UCAL_OCTOBER, /** November */ UCAL_NOVEMBER, /** December */ UCAL_DECEMBER, /** Value of the UCAL_MONTH field indicating the * thirteenth month of the year. Although the Gregorian calendar * does not use this value, lunar calendars do. */ UCAL_UNDECIMBER }; /** @stable ICU 2.0 */ typedef enum UCalendarMonths UCalendarMonths; /** Possible AM/PM values in a UCalendar * @stable ICU 2.0 */ enum UCalendarAMPMs { /** AM */ UCAL_AM, /** PM */ UCAL_PM }; /** @stable ICU 2.0 */ typedef enum UCalendarAMPMs UCalendarAMPMs; /** * System time zone type constants used by filtering zones * in ucal_openTimeZoneIDEnumeration. * @see ucal_openTimeZoneIDEnumeration * @stable ICU 4.8 */ enum USystemTimeZoneType { /** * Any system zones. * @stable ICU 4.8 */ UCAL_ZONE_TYPE_ANY, /** * Canonical system zones. 
* @stable ICU 4.8 */ UCAL_ZONE_TYPE_CANONICAL, /** * Canonical system zones associated with actual locations. * @stable ICU 4.8 */ UCAL_ZONE_TYPE_CANONICAL_LOCATION }; /** @stable ICU 4.8 */ typedef enum USystemTimeZoneType USystemTimeZoneType; /** * Create an enumeration over system time zone IDs with the given * filter conditions. * @param zoneType The system time zone type. * @param region The ISO 3166 two-letter country code or UN M.49 * three-digit area code. When NULL, no filtering is * done by region. * @param rawOffset An offset from GMT in milliseconds, ignoring the * effect of daylight savings time, if any. When NULL, * no filtering is done by zone offset. * @param ec A pointer to an UErrorCode to receive any errors * @return an enumeration object that the caller must dispose of * using uenum_close(), or NULL upon failure. In case of failure, * *ec will indicate the error. * @stable ICU 4.8 */ U_CAPI UEnumeration* U_EXPORT2 ucal_openTimeZoneIDEnumeration(USystemTimeZoneType zoneType, const char* region, const int32_t* rawOffset, UErrorCode* ec); /** * Create an enumeration over all time zones. * * @param ec input/output error code * * @return an enumeration object that the caller must dispose of using * uenum_close(), or NULL upon failure. In case of failure *ec will * indicate the error. * * @stable ICU 2.6 */ U_CAPI UEnumeration* U_EXPORT2 ucal_openTimeZones(UErrorCode* ec); /** * Create an enumeration over all time zones associated with the given * country. Some zones are affiliated with no country (e.g., "UTC"); * these may also be retrieved, as a group. * * @param country the ISO 3166 two-letter country code, or NULL to * retrieve zones not affiliated with any country * * @param ec input/output error code * * @return an enumeration object that the caller must dispose of using * uenum_close(), or NULL upon failure. In case of failure *ec will * indicate the error. 
* * @stable ICU 2.6 */ U_CAPI UEnumeration* U_EXPORT2 ucal_openCountryTimeZones(const char* country, UErrorCode* ec); /** * Return the default time zone. The default is determined initially * by querying the host operating system. If the host system detection * routines fail, or if they specify a TimeZone or TimeZone offset * which is not recognized, then the special TimeZone "Etc/Unknown" * is returned. * * The default may be changed with `ucal_setDefaultTimeZone()` or with * the C++ TimeZone API, `TimeZone::adoptDefault(TimeZone*)`. * * @param result A buffer to receive the result, or NULL * * @param resultCapacity The capacity of the result buffer * * @param ec input/output error code * * @return The result string length, not including the terminating * null * * @see #UCAL_UNKNOWN_ZONE_ID * * @stable ICU 2.6 */ U_CAPI int32_t U_EXPORT2 ucal_getDefaultTimeZone(UChar* result, int32_t resultCapacity, UErrorCode* ec); /** * Set the default time zone. * * @param zoneID null-terminated time zone ID * * @param ec input/output error code * * @stable ICU 2.6 */ U_CAPI void U_EXPORT2 ucal_setDefaultTimeZone(const UChar* zoneID, UErrorCode* ec); #if (NTDDI_VERSION >= NTDDI_WIN10_CO) /** * Return the current host time zone. The host time zone is detected from * the current host system configuration by querying the host operating * system. If the host system detection routines fail, or if they specify * a TimeZone or TimeZone offset which is not recognized, then the special * TimeZone "Etc/Unknown" is returned. * * Note that host time zone and the ICU default time zone can be different. * * The ICU default time zone does not change once initialized unless modified * by calling `ucal_setDefaultTimeZone()` or with the C++ TimeZone API, * `TimeZone::adoptDefault(TimeZone*)`. * * If the host operating system configuration has changed since ICU has * initialized then the returned value can be different than the ICU default * time zone, even if the default has not changed. * *

This function is not thread safe.

* * @param result A buffer to receive the result, or NULL * @param resultCapacity The capacity of the result buffer * @param ec input/output error code * @return The result string length, not including the terminating * null * * @see #UCAL_UNKNOWN_ZONE_ID * * @stable ICU 65 */ U_CAPI int32_t U_EXPORT2 ucal_getHostTimeZone(UChar *result, int32_t resultCapacity, UErrorCode *ec); #endif // (NTDDI_VERSION >= NTDDI_WIN10_CO) /** * Return the amount of time in milliseconds that the clock is * advanced during daylight savings time for the given time zone, or * zero if the time zone does not observe daylight savings time. * * @param zoneID null-terminated time zone ID * * @param ec input/output error code * * @return the number of milliseconds the time is advanced with * respect to standard time when the daylight savings rules are in * effect. This is always a non-negative number, most commonly either * 3,600,000 (one hour) or zero. * * @stable ICU 2.6 */ U_CAPI int32_t U_EXPORT2 ucal_getDSTSavings(const UChar* zoneID, UErrorCode* ec); /** * Get the current date and time. * The value returned is represented as milliseconds from the epoch. * @return The current date and time. * @stable ICU 2.0 */ U_CAPI UDate U_EXPORT2 ucal_getNow(void); /** * Open a UCalendar. * A UCalendar may be used to convert a millisecond value to a year, * month, and day. *

* Note: When unknown TimeZone ID is specified or if the TimeZone ID specified is "Etc/Unknown", * the UCalendar returned by the function is initialized with GMT zone with TimeZone ID * UCAL_UNKNOWN_ZONE_ID ("Etc/Unknown") without any errors/warnings. If you want * to check if a TimeZone ID is valid prior to this function, use ucal_getCanonicalTimeZoneID. * * @param zoneID The desired TimeZone ID. If 0, use the default time zone. * @param len The length of zoneID, or -1 if null-terminated. * @param locale The desired locale * @param type The type of UCalendar to open. This can be UCAL_GREGORIAN to open the Gregorian * calendar for the locale, or UCAL_DEFAULT to open the default calendar for the locale (the * default calendar may also be Gregorian). To open a specific non-Gregorian calendar for the * locale, use uloc_setKeywordValue to set the value of the calendar keyword for the locale * and then pass the locale to ucal_open with UCAL_DEFAULT as the type. * @param status A pointer to an UErrorCode to receive any errors * @return A pointer to a UCalendar, or 0 if an error occurred. * @see #UCAL_UNKNOWN_ZONE_ID * @stable ICU 2.0 */ U_CAPI UCalendar* U_EXPORT2 ucal_open(const UChar* zoneID, int32_t len, const char* locale, UCalendarType type, UErrorCode* status); /** * Close a UCalendar. * Once closed, a UCalendar may no longer be used. * @param cal The UCalendar to close. * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ucal_close(UCalendar *cal); /** * Open a copy of a UCalendar. * This function performs a deep copy. * @param cal The calendar to copy * @param status A pointer to an UErrorCode to receive any errors. * @return A pointer to a UCalendar identical to cal. * @stable ICU 4.0 */ U_CAPI UCalendar* U_EXPORT2 ucal_clone(const UCalendar* cal, UErrorCode* status); /** * Set the TimeZone used by a UCalendar. * A UCalendar uses a timezone for converting from Greenwich time to local time. * @param cal The UCalendar to set. * @param zoneID The desired TimeZone ID. 
If 0, use the default time zone. * @param len The length of zoneID, or -1 if null-terminated. * @param status A pointer to an UErrorCode to receive any errors. * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ucal_setTimeZone(UCalendar* cal, const UChar* zoneID, int32_t len, UErrorCode* status); /** * Get the ID of the UCalendar's time zone. * * @param cal The UCalendar to query. * @param result Receives the UCalendar's time zone ID. * @param resultLength The maximum size of result. * @param status Receives the status. * @return The total buffer size needed; if greater than resultLength, the output was truncated. * @stable ICU 51 */ U_CAPI int32_t U_EXPORT2 ucal_getTimeZoneID(const UCalendar *cal, UChar *result, int32_t resultLength, UErrorCode *status); /** * Possible formats for a UCalendar's display name * @stable ICU 2.0 */ enum UCalendarDisplayNameType { /** Standard display name */ UCAL_STANDARD, /** Short standard display name */ UCAL_SHORT_STANDARD, /** Daylight savings display name */ UCAL_DST, /** Short daylight savings display name */ UCAL_SHORT_DST }; /** @stable ICU 2.0 */ typedef enum UCalendarDisplayNameType UCalendarDisplayNameType; /** * Get the display name for a UCalendar's TimeZone. * A display name is suitable for presentation to a user. * @param cal The UCalendar to query. * @param type The desired display name format; one of UCAL_STANDARD, UCAL_SHORT_STANDARD, * UCAL_DST, UCAL_SHORT_DST * @param locale The desired locale for the display name. * @param result A pointer to a buffer to receive the display name. * @param resultLength The maximum size of result. * @param status A pointer to an UErrorCode to receive any errors * @return The total buffer size needed; if greater than resultLength, the output was truncated. 
* @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 ucal_getTimeZoneDisplayName(const UCalendar* cal, UCalendarDisplayNameType type, const char* locale, UChar* result, int32_t resultLength, UErrorCode* status); /** * Determine if a UCalendar is currently in daylight savings time. * Daylight savings time is not used in all parts of the world. * @param cal The UCalendar to query. * @param status A pointer to an UErrorCode to receive any errors * @return true if cal is currently in daylight savings time, false otherwise * @stable ICU 2.0 */ U_CAPI UBool U_EXPORT2 ucal_inDaylightTime(const UCalendar* cal, UErrorCode* status ); /** * Sets the GregorianCalendar change date. This is the point when the switch from * Julian dates to Gregorian dates occurred. Default is 00:00:00 local time, October * 15, 1582. Previous to this time and date will be Julian dates. * * This function works only for Gregorian calendars. If the UCalendar is not * an instance of a Gregorian calendar, then a U_UNSUPPORTED_ERROR * error code is set. * * @param cal The calendar object. * @param date The given Gregorian cutover date. * @param pErrorCode Pointer to a standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * * @see GregorianCalendar::setGregorianChange * @see ucal_getGregorianChange * @stable ICU 3.6 */ U_CAPI void U_EXPORT2 ucal_setGregorianChange(UCalendar *cal, UDate date, UErrorCode *pErrorCode); /** * Gets the Gregorian Calendar change date. This is the point when the switch from * Julian dates to Gregorian dates occurred. Default is 00:00:00 local time, October * 15, 1582. Previous to this time and date will be Julian dates. * * This function works only for Gregorian calendars. If the UCalendar is not * an instance of a Gregorian calendar, then a U_UNSUPPORTED_ERROR * error code is set. * * @param cal The calendar object. 
* @param pErrorCode Pointer to a standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return The Gregorian cutover time for this calendar. * * @see GregorianCalendar::getGregorianChange * @see ucal_setGregorianChange * @stable ICU 3.6 */ U_CAPI UDate U_EXPORT2 ucal_getGregorianChange(const UCalendar *cal, UErrorCode *pErrorCode); /** * Types of UCalendar attributes * @stable ICU 2.0 */ enum UCalendarAttribute { /** * Lenient parsing * @stable ICU 2.0 */ UCAL_LENIENT, /** * First day of week * @stable ICU 2.0 */ UCAL_FIRST_DAY_OF_WEEK, /** * Minimum number of days in first week * @stable ICU 2.0 */ UCAL_MINIMAL_DAYS_IN_FIRST_WEEK, /** * The behavior for handling wall time repeating multiple times * at negative time zone offset transitions * @stable ICU 49 */ UCAL_REPEATED_WALL_TIME, /** * The behavior for handling skipped wall time at positive time * zone offset transitions. * @stable ICU 49 */ UCAL_SKIPPED_WALL_TIME }; /** @stable ICU 2.0 */ typedef enum UCalendarAttribute UCalendarAttribute; /** * Options for handling ambiguous wall time at time zone * offset transitions. * @stable ICU 49 */ enum UCalendarWallTimeOption { /** * An ambiguous wall time to be interpreted as the latest. * This option is valid for UCAL_REPEATED_WALL_TIME and * UCAL_SKIPPED_WALL_TIME. * @stable ICU 49 */ UCAL_WALLTIME_LAST, /** * An ambiguous wall time to be interpreted as the earliest. * This option is valid for UCAL_REPEATED_WALL_TIME and * UCAL_SKIPPED_WALL_TIME. * @stable ICU 49 */ UCAL_WALLTIME_FIRST, /** * An ambiguous wall time to be interpreted as the next valid * wall time. This option is valid for UCAL_SKIPPED_WALL_TIME. * @stable ICU 49 */ UCAL_WALLTIME_NEXT_VALID }; /** @stable ICU 49 */ typedef enum UCalendarWallTimeOption UCalendarWallTimeOption; /** * Get a numeric attribute associated with a UCalendar. 
* Numeric attributes include the first day of the week, or the minimal numbers * of days in the first week of the month. * @param cal The UCalendar to query. * @param attr The desired attribute; one of UCAL_LENIENT, UCAL_FIRST_DAY_OF_WEEK, * UCAL_MINIMAL_DAYS_IN_FIRST_WEEK, UCAL_REPEATED_WALL_TIME or UCAL_SKIPPED_WALL_TIME * @return The value of attr. * @see ucal_setAttribute * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 ucal_getAttribute(const UCalendar* cal, UCalendarAttribute attr); /** * Set a numeric attribute associated with a UCalendar. * Numeric attributes include the first day of the week, or the minimal numbers * of days in the first week of the month. * @param cal The UCalendar to set. * @param attr The desired attribute; one of UCAL_LENIENT, UCAL_FIRST_DAY_OF_WEEK, * UCAL_MINIMAL_DAYS_IN_FIRST_WEEK, UCAL_REPEATED_WALL_TIME or UCAL_SKIPPED_WALL_TIME * @param newValue The new value of attr. * @see ucal_getAttribute * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ucal_setAttribute(UCalendar* cal, UCalendarAttribute attr, int32_t newValue); /** * Get a locale for which calendars are available. * A UCalendar in a locale returned by this function will contain the correct * day and month names for the locale. * @param localeIndex The index of the desired locale. * @return A locale for which calendars are available, or 0 if none. * @see ucal_countAvailable * @stable ICU 2.0 */ U_CAPI const char* U_EXPORT2 ucal_getAvailable(int32_t localeIndex); /** * Determine how many locales have calendars available. * This function is most useful as determining the loop ending condition for * calls to \ref ucal_getAvailable. * @return The number of locales for which calendars are available. * @see ucal_getAvailable * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 ucal_countAvailable(void); /** * Get a UCalendar's current time in millis. * The time is represented as milliseconds from the epoch. * @param cal The UCalendar to query. 
* @param status A pointer to an UErrorCode to receive any errors * @return The calendar's current time in millis. * @see ucal_setMillis * @see ucal_setDate * @see ucal_setDateTime * @stable ICU 2.0 */ U_CAPI UDate U_EXPORT2 ucal_getMillis(const UCalendar* cal, UErrorCode* status); /** * Set a UCalendar's current time in millis. * The time is represented as milliseconds from the epoch. * @param cal The UCalendar to set. * @param dateTime The desired date and time. * @param status A pointer to an UErrorCode to receive any errors * @see ucal_getMillis * @see ucal_setDate * @see ucal_setDateTime * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ucal_setMillis(UCalendar* cal, UDate dateTime, UErrorCode* status ); /** * Set a UCalendar's current date. * The date is represented as a series of 32-bit integers. * @param cal The UCalendar to set. * @param year The desired year. * @param month The desired month; one of UCAL_JANUARY, UCAL_FEBRUARY, UCAL_MARCH, UCAL_APRIL, UCAL_MAY, * UCAL_JUNE, UCAL_JULY, UCAL_AUGUST, UCAL_SEPTEMBER, UCAL_OCTOBER, UCAL_NOVEMBER, UCAL_DECEMBER, UCAL_UNDECIMBER * @param date The desired day of the month. * @param status A pointer to an UErrorCode to receive any errors * @see ucal_getMillis * @see ucal_setMillis * @see ucal_setDateTime * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ucal_setDate(UCalendar* cal, int32_t year, int32_t month, int32_t date, UErrorCode* status); /** * Set a UCalendar's current date and time. * The date and time are represented as a series of 32-bit integers. * @param cal The UCalendar to set. * @param year The desired year. * @param month The desired month; one of UCAL_JANUARY, UCAL_FEBRUARY, UCAL_MARCH, UCAL_APRIL, UCAL_MAY, * UCAL_JUNE, UCAL_JULY, UCAL_AUGUST, UCAL_SEPTEMBER, UCAL_OCTOBER, UCAL_NOVEMBER, UCAL_DECEMBER, UCAL_UNDECIMBER * @param date The desired day of the month. * @param hour The desired hour of day. * @param minute The desired minute. * @param second The desired second. 
* @param status A pointer to an UErrorCode to receive any errors * @see ucal_getMillis * @see ucal_setMillis * @see ucal_setDate * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ucal_setDateTime(UCalendar* cal, int32_t year, int32_t month, int32_t date, int32_t hour, int32_t minute, int32_t second, UErrorCode* status); /** * Returns true if two UCalendars are equivalent. Equivalent * UCalendars will behave identically, but they may be set to * different times. * @param cal1 The first of the UCalendars to compare. * @param cal2 The second of the UCalendars to compare. * @return true if cal1 and cal2 are equivalent, false otherwise. * @stable ICU 2.0 */ U_CAPI UBool U_EXPORT2 ucal_equivalentTo(const UCalendar* cal1, const UCalendar* cal2); /** * Add a specified signed amount to a particular field in a UCalendar. * This can modify more significant fields in the calendar. * Adding a positive value always means moving forward in time, so for the Gregorian calendar, * starting with 100 BC and adding +1 to year results in 99 BC (even though this actually reduces * the numeric value of the field itself). * @param cal The UCalendar to which to add. * @param field The field to which to add the signed value; one of UCAL_ERA, UCAL_YEAR, UCAL_MONTH, * UCAL_WEEK_OF_YEAR, UCAL_WEEK_OF_MONTH, UCAL_DATE, UCAL_DAY_OF_YEAR, UCAL_DAY_OF_WEEK, * UCAL_DAY_OF_WEEK_IN_MONTH, UCAL_AM_PM, UCAL_HOUR, UCAL_HOUR_OF_DAY, UCAL_MINUTE, UCAL_SECOND, * UCAL_MILLISECOND, UCAL_ZONE_OFFSET, UCAL_DST_OFFSET. * @param amount The signed amount to add to field. If the amount causes the value * to exceed the maximum or minimum values for that field, other fields are modified * to preserve the magnitude of the change. * @param status A pointer to an UErrorCode to receive any errors * @see ucal_roll * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ucal_add(UCalendar* cal, UCalendarDateFields field, int32_t amount, UErrorCode* status); /** * Add a specified signed amount to a particular field in a UCalendar. 
* This will not modify more significant fields in the calendar. * Rolling by a positive value always means moving forward in time (unless the limit of the * field is reached, in which case it may pin or wrap), so for Gregorian calendar, * starting with 100 BC and rolling the year by +1 results in 99 BC. * When eras have a definite beginning and end (as in the Chinese calendar, or as in most eras in the * Japanese calendar) then rolling the year past either limit of the era will cause the year to wrap around. * When eras only have a limit at one end, then attempting to roll the year past that limit will result in * pinning the year at that limit. Note that for most calendars in which era 0 years move forward in time * (such as Buddhist, Hebrew, or Islamic), it is possible for add or roll to result in negative years for * era 0 (that is the only way to represent years before the calendar epoch). * @param cal The UCalendar to which to add. * @param field The field to which to add the signed value; one of UCAL_ERA, UCAL_YEAR, UCAL_MONTH, * UCAL_WEEK_OF_YEAR, UCAL_WEEK_OF_MONTH, UCAL_DATE, UCAL_DAY_OF_YEAR, UCAL_DAY_OF_WEEK, * UCAL_DAY_OF_WEEK_IN_MONTH, UCAL_AM_PM, UCAL_HOUR, UCAL_HOUR_OF_DAY, UCAL_MINUTE, UCAL_SECOND, * UCAL_MILLISECOND, UCAL_ZONE_OFFSET, UCAL_DST_OFFSET. * @param amount The signed amount to add to field. If the amount causes the value * to exceed the maximum or minimum values for that field, the field is pinned to a permissible * value. * @param status A pointer to an UErrorCode to receive any errors * @see ucal_add * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ucal_roll(UCalendar* cal, UCalendarDateFields field, int32_t amount, UErrorCode* status); /** * Get the current value of a field from a UCalendar. * All fields are represented as 32-bit integers. * @param cal The UCalendar to query. 
* @param field The desired field; one of UCAL_ERA, UCAL_YEAR, UCAL_MONTH, * UCAL_WEEK_OF_YEAR, UCAL_WEEK_OF_MONTH, UCAL_DATE, UCAL_DAY_OF_YEAR, UCAL_DAY_OF_WEEK, * UCAL_DAY_OF_WEEK_IN_MONTH, UCAL_AM_PM, UCAL_HOUR, UCAL_HOUR_OF_DAY, UCAL_MINUTE, UCAL_SECOND, * UCAL_MILLISECOND, UCAL_ZONE_OFFSET, UCAL_DST_OFFSET. * @param status A pointer to an UErrorCode to receive any errors * @return The value of the desired field. * @see ucal_set * @see ucal_isSet * @see ucal_clearField * @see ucal_clear * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 ucal_get(const UCalendar* cal, UCalendarDateFields field, UErrorCode* status ); /** * Set the value of a field in a UCalendar. * All fields are represented as 32-bit integers. * @param cal The UCalendar to set. * @param field The field to set; one of UCAL_ERA, UCAL_YEAR, UCAL_MONTH, * UCAL_WEEK_OF_YEAR, UCAL_WEEK_OF_MONTH, UCAL_DATE, UCAL_DAY_OF_YEAR, UCAL_DAY_OF_WEEK, * UCAL_DAY_OF_WEEK_IN_MONTH, UCAL_AM_PM, UCAL_HOUR, UCAL_HOUR_OF_DAY, UCAL_MINUTE, UCAL_SECOND, * UCAL_MILLISECOND, UCAL_ZONE_OFFSET, UCAL_DST_OFFSET. * @param value The desired value of field. * @see ucal_get * @see ucal_isSet * @see ucal_clearField * @see ucal_clear * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ucal_set(UCalendar* cal, UCalendarDateFields field, int32_t value); /** * Determine if a field in a UCalendar is set. * All fields are represented as 32-bit integers. * @param cal The UCalendar to query. * @param field The desired field; one of UCAL_ERA, UCAL_YEAR, UCAL_MONTH, * UCAL_WEEK_OF_YEAR, UCAL_WEEK_OF_MONTH, UCAL_DATE, UCAL_DAY_OF_YEAR, UCAL_DAY_OF_WEEK, * UCAL_DAY_OF_WEEK_IN_MONTH, UCAL_AM_PM, UCAL_HOUR, UCAL_HOUR_OF_DAY, UCAL_MINUTE, UCAL_SECOND, * UCAL_MILLISECOND, UCAL_ZONE_OFFSET, UCAL_DST_OFFSET. * @return true if field is set, false otherwise. 
* @see ucal_get * @see ucal_set * @see ucal_clearField * @see ucal_clear * @stable ICU 2.0 */ U_CAPI UBool U_EXPORT2 ucal_isSet(const UCalendar* cal, UCalendarDateFields field); /** * Clear a field in a UCalendar. * All fields are represented as 32-bit integers. * @param cal The UCalendar containing the field to clear. * @param field The field to clear; one of UCAL_ERA, UCAL_YEAR, UCAL_MONTH, * UCAL_WEEK_OF_YEAR, UCAL_WEEK_OF_MONTH, UCAL_DATE, UCAL_DAY_OF_YEAR, UCAL_DAY_OF_WEEK, * UCAL_DAY_OF_WEEK_IN_MONTH, UCAL_AM_PM, UCAL_HOUR, UCAL_HOUR_OF_DAY, UCAL_MINUTE, UCAL_SECOND, * UCAL_MILLISECOND, UCAL_ZONE_OFFSET, UCAL_DST_OFFSET. * @see ucal_get * @see ucal_set * @see ucal_isSet * @see ucal_clear * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ucal_clearField(UCalendar* cal, UCalendarDateFields field); /** * Clear all fields in a UCalendar. * All fields are represented as 32-bit integers. * @param calendar The UCalendar to clear. * @see ucal_get * @see ucal_set * @see ucal_isSet * @see ucal_clearField * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ucal_clear(UCalendar* calendar); /** * Possible limit values for a UCalendar * @stable ICU 2.0 */ enum UCalendarLimitType { /** Minimum value */ UCAL_MINIMUM, /** Maximum value */ UCAL_MAXIMUM, /** Greatest minimum value */ UCAL_GREATEST_MINIMUM, /** Least maximum value */ UCAL_LEAST_MAXIMUM, /** Actual minimum value */ UCAL_ACTUAL_MINIMUM, /** Actual maximum value */ UCAL_ACTUAL_MAXIMUM }; /** @stable ICU 2.0 */ typedef enum UCalendarLimitType UCalendarLimitType; /** * Determine a limit for a field in a UCalendar. * A limit is a maximum or minimum value for a field. * @param cal The UCalendar to query. * @param field The desired field; one of UCAL_ERA, UCAL_YEAR, UCAL_MONTH, * UCAL_WEEK_OF_YEAR, UCAL_WEEK_OF_MONTH, UCAL_DATE, UCAL_DAY_OF_YEAR, UCAL_DAY_OF_WEEK, * UCAL_DAY_OF_WEEK_IN_MONTH, UCAL_AM_PM, UCAL_HOUR, UCAL_HOUR_OF_DAY, UCAL_MINUTE, UCAL_SECOND, * UCAL_MILLISECOND, UCAL_ZONE_OFFSET, UCAL_DST_OFFSET.
* @param type The desired critical point; one of UCAL_MINIMUM, UCAL_MAXIMUM, UCAL_GREATEST_MINIMUM, * UCAL_LEAST_MAXIMUM, UCAL_ACTUAL_MINIMUM, UCAL_ACTUAL_MAXIMUM * @param status A pointer to an UErrorCode to receive any errors. * @return The requested value. * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 ucal_getLimit(const UCalendar* cal, UCalendarDateFields field, UCalendarLimitType type, UErrorCode* status); /** Get the locale for this calendar object. You can choose between valid and actual locale. * @param cal The calendar object * @param type type of the locale we're looking for (valid or actual) * @param status error code for the operation * @return the locale name * @stable ICU 2.8 */ U_CAPI const char * U_EXPORT2 ucal_getLocaleByType(const UCalendar *cal, ULocDataLocaleType type, UErrorCode* status); /** * Returns the timezone data version currently used by ICU. * @param status error code for the operation * @return the version string, such as "2007f" * @stable ICU 3.8 */ U_CAPI const char * U_EXPORT2 ucal_getTZDataVersion(UErrorCode* status); /** * Returns the canonical system timezone ID or the normalized * custom time zone ID for the given time zone ID. * @param id The input timezone ID to be canonicalized. * @param len The length of id, or -1 if null-terminated. * @param result The buffer receives the canonical system timezone ID * or the custom timezone ID in normalized format. * @param resultCapacity The capacity of the result buffer. * @param isSystemID Receives if the given ID is a known system * timezone ID. * @param status Receives the status. When the given timezone ID * is neither a known system time zone ID nor a * valid custom timezone ID, U_ILLEGAL_ARGUMENT_ERROR * is set. * @return The result string length, not including the terminating * null. 
* @stable ICU 4.0 */ U_CAPI int32_t U_EXPORT2 ucal_getCanonicalTimeZoneID(const UChar* id, int32_t len, UChar* result, int32_t resultCapacity, UBool *isSystemID, UErrorCode* status); /** * Get the resource keyword value string designating the calendar type for the UCalendar. * @param cal The UCalendar to query. * @param status The error code for the operation. * @return The resource keyword value string. * @stable ICU 4.2 */ U_CAPI const char * U_EXPORT2 ucal_getType(const UCalendar *cal, UErrorCode* status); /** * Given a key and a locale, returns an array of string values in a preferred * order that would make a difference. These are all and only those values where * the open (creation) of the service with the locale formed from the input locale * plus input keyword and that value has different behavior than creation with the * input locale alone. * @param key one of the keys supported by this service. For now, only * "calendar" is supported. * @param locale the locale * @param commonlyUsed if set to true it will return only commonly used values * with the given locale in preferred order. Otherwise, * it will return all the available values for the locale. * @param status error status * @return a string enumeration over keyword values for the given key and the locale. * @stable ICU 4.2 */ U_CAPI UEnumeration* U_EXPORT2 ucal_getKeywordValuesForLocale(const char* key, const char* locale, UBool commonlyUsed, UErrorCode* status); /** Weekday types, as returned by ucal_getDayOfWeekType(). * @stable ICU 4.4 */ enum UCalendarWeekdayType { /** * Designates a full weekday (no part of the day is included in the weekend). * @stable ICU 4.4 */ UCAL_WEEKDAY, /** * Designates a full weekend day (the entire day is included in the weekend). * @stable ICU 4.4 */ UCAL_WEEKEND, /** * Designates a day that starts as a weekday and transitions to the weekend. * Call ucal_getWeekendTransition() to get the time of transition. 
* @stable ICU 4.4 */ UCAL_WEEKEND_ONSET, /** * Designates a day that starts as the weekend and transitions to a weekday. * Call ucal_getWeekendTransition() to get the time of transition. * @stable ICU 4.4 */ UCAL_WEEKEND_CEASE }; /** @stable ICU 4.4 */ typedef enum UCalendarWeekdayType UCalendarWeekdayType; /** * Returns whether the given day of the week is a weekday, a weekend day, * or a day that transitions from one to the other, for the locale and * calendar system associated with this UCalendar (the locale's region is * often the most determinant factor). If a transition occurs at midnight, * then the days before and after the transition will have the * type UCAL_WEEKDAY or UCAL_WEEKEND. If a transition occurs at a time * other than midnight, then the day of the transition will have * the type UCAL_WEEKEND_ONSET or UCAL_WEEKEND_CEASE. In this case, the * function ucal_getWeekendTransition() will return the point of * transition. * @param cal The UCalendar to query. * @param dayOfWeek The day of the week whose type is desired (UCAL_SUNDAY..UCAL_SATURDAY). * @param status The error code for the operation. * @return The UCalendarWeekdayType for the day of the week. * @stable ICU 4.4 */ U_CAPI UCalendarWeekdayType U_EXPORT2 ucal_getDayOfWeekType(const UCalendar *cal, UCalendarDaysOfWeek dayOfWeek, UErrorCode* status); /** * Returns the time during the day at which the weekend begins or ends in * this calendar system. If ucal_getDayOfWeekType() returns UCAL_WEEKEND_ONSET * for the specified dayOfWeek, return the time at which the weekend begins. * If ucal_getDayOfWeekType() returns UCAL_WEEKEND_CEASE for the specified dayOfWeek, * return the time at which the weekend ends. If ucal_getDayOfWeekType() returns * some other UCalendarWeekdayType for the specified dayOfWeek, it is an error condition * (U_ILLEGAL_ARGUMENT_ERROR). * @param cal The UCalendar to query.
* @param dayOfWeek The day of the week for which the weekend transition time is * desired (UCAL_SUNDAY..UCAL_SATURDAY). * @param status The error code for the operation. * @return The milliseconds after midnight at which the weekend begins or ends. * @stable ICU 4.4 */ U_CAPI int32_t U_EXPORT2 ucal_getWeekendTransition(const UCalendar *cal, UCalendarDaysOfWeek dayOfWeek, UErrorCode *status); /** * Returns true if the given UDate is in the weekend in * this calendar system. * @param cal The UCalendar to query. * @param date The UDate in question. * @param status The error code for the operation. * @return true if the given UDate is in the weekend in * this calendar system, false otherwise. * @stable ICU 4.4 */ U_CAPI UBool U_EXPORT2 ucal_isWeekend(const UCalendar *cal, UDate date, UErrorCode *status); /** * Return the difference between the target time and the time this calendar object is currently set to. * If the target time is after the current calendar setting, then the returned value will be positive. * The field parameter specifies the units of the return value. For example, if field is UCAL_MONTH * and ucal_getFieldDifference returns 3, then the target time is 3 to less than 4 months after the * current calendar setting. * * As a side effect of this call, this calendar is advanced toward target by the given amount. That is, * calling this function has the side effect of calling ucal_add on this calendar with the specified * field and an amount equal to the return value from this function. * * A typical way of using this function is to call it first with the largest field of interest, then * with progressively smaller fields. * * @param cal The UCalendar to compare and update. * @param target The target date to compare to the current calendar setting.
* @param field The field to compare; one of UCAL_ERA, UCAL_YEAR, UCAL_MONTH, * UCAL_WEEK_OF_YEAR, UCAL_WEEK_OF_MONTH, UCAL_DATE, UCAL_DAY_OF_YEAR, UCAL_DAY_OF_WEEK, * UCAL_DAY_OF_WEEK_IN_MONTH, UCAL_AM_PM, UCAL_HOUR, UCAL_HOUR_OF_DAY, UCAL_MINUTE, UCAL_SECOND, * UCAL_MILLISECOND, UCAL_ZONE_OFFSET, UCAL_DST_OFFSET. * @param status A pointer to an UErrorCode to receive any errors * @return The date difference for the specified field. * @stable ICU 4.8 */ U_CAPI int32_t U_EXPORT2 ucal_getFieldDifference(UCalendar* cal, UDate target, UCalendarDateFields field, UErrorCode* status); /** * Time zone transition types for ucal_getTimeZoneTransitionDate * @stable ICU 50 */ enum UTimeZoneTransitionType { /** * Get the next transition after the current date, * i.e. excludes the current date * @stable ICU 50 */ UCAL_TZ_TRANSITION_NEXT, /** * Get the next transition on or after the current date, * i.e. may include the current date * @stable ICU 50 */ UCAL_TZ_TRANSITION_NEXT_INCLUSIVE, /** * Get the previous transition before the current date, * i.e. excludes the current date * @stable ICU 50 */ UCAL_TZ_TRANSITION_PREVIOUS, /** * Get the previous transition on or before the current date, * i.e. may include the current date * @stable ICU 50 */ UCAL_TZ_TRANSITION_PREVIOUS_INCLUSIVE }; typedef enum UTimeZoneTransitionType UTimeZoneTransitionType; /**< @stable ICU 50 */ /** * Get the UDate for the next/previous time zone transition relative to * the calendar's current date, in the time zone to which the calendar * is currently set. If there is no known time zone transition of the * requested type relative to the calendar's date, the function returns * false. * @param cal The UCalendar to query. * @param type The type of transition desired. * @param transition A pointer to a UDate to be set to the transition time. * If the function returns false, the value set is unspecified. * @param status A pointer to a UErrorCode to receive any errors. 
* @return true if a valid transition time is set in *transition, false * otherwise. * @stable ICU 50 */ U_CAPI UBool U_EXPORT2 ucal_getTimeZoneTransitionDate(const UCalendar* cal, UTimeZoneTransitionType type, UDate* transition, UErrorCode* status); /** * Converts a system time zone ID to an equivalent Windows time zone ID. For example, * Windows time zone ID "Pacific Standard Time" is returned for input "America/Los_Angeles". * *

There are system time zones that cannot be mapped to Windows zones. When the input * system time zone ID is unknown or unmappable to a Windows time zone, then this * function returns 0 as the result length, but the operation itself remains successful * (no error status set on return). * *

This implementation utilizes * Zone-Tzid mapping data. The mapping data is updated from time to time. To get the latest changes, * please read the ICU user guide section * Updating the Time Zone Data. * * @param id A system time zone ID. * @param len The length of id, or -1 if null-terminated. * @param winid A buffer to receive a Windows time zone ID. * @param winidCapacity The capacity of the result buffer winid. * @param status Receives the status. * @return The result string length, not including the terminating null. * @see ucal_getTimeZoneIDForWindowsID * * @stable ICU 52 */ U_CAPI int32_t U_EXPORT2 ucal_getWindowsTimeZoneID(const UChar* id, int32_t len, UChar* winid, int32_t winidCapacity, UErrorCode* status); /** * Converts a Windows time zone ID to an equivalent system time zone ID * for a region. For example, system time zone ID "America/Los_Angeles" is returned * for input Windows ID "Pacific Standard Time" and region "US" (or null), * "America/Vancouver" is returned for the same Windows ID "Pacific Standard Time" and * region "CA". * *

Not all Windows time zones can be mapped to system time zones. When the input * Windows time zone ID is unknown or unmappable to a system time zone, then this * function returns 0 as the result length, but the operation itself remains successful * (no error status set on return). * *

This implementation utilizes * Zone-Tzid mapping data. The mapping data is updated from time to time. To get the latest changes, * please read the ICU user guide section * Updating the Time Zone Data. * * @param winid A Windows time zone ID. * @param len The length of winid, or -1 if null-terminated. * @param region A null-terminated region code, or NULL if no regional preference. * @param id A buffer to receive a system time zone ID. * @param idCapacity The capacity of the result buffer id. * @param status Receives the status. * @return The result string length, not including the terminating null. * @see ucal_getWindowsTimeZoneID * * @stable ICU 52 */ U_CAPI int32_t U_EXPORT2 ucal_getTimeZoneIDForWindowsID(const UChar* winid, int32_t len, const char* region, UChar* id, int32_t idCapacity, UErrorCode* status); #if (NTDDI_VERSION >= NTDDI_WIN11_ZN) /** * Options used by ucal_getTimeZoneOffsetFromLocal and BasicTimeZone::getOffsetFromLocal() * to specify how to interpret an input time when it does not exist, or when it is ambiguous, * around a time zone transition. * @stable ICU 69 */ enum UTimeZoneLocalOption { /** * An input time is always interpreted as local time before * a time zone transition. * @stable ICU 69 */ UCAL_TZ_LOCAL_FORMER = 0x04, /** * An input time is always interpreted as local time after * a time zone transition. * @stable ICU 69 */ UCAL_TZ_LOCAL_LATTER = 0x0C, /** * An input time is interpreted as standard time when local * time is switched to/from daylight saving time. When both * sides of a time zone transition are standard time, * or daylight saving time, the local time before the * transition is used. * @stable ICU 69 */ UCAL_TZ_LOCAL_STANDARD_FORMER = UCAL_TZ_LOCAL_FORMER | 0x01, /** * An input time is interpreted as standard time when local * time is switched to/from daylight saving time. When both * sides of a time zone transition are standard time, * or daylight saving time, the local time after the * transition is used.
* @stable ICU 69 */ UCAL_TZ_LOCAL_STANDARD_LATTER = UCAL_TZ_LOCAL_LATTER | 0x01, /** * An input time is interpreted as daylight saving time when * local time is switched to/from standard time. When both * sides of a time zone transition are standard time, * or daylight saving time, the local time before the * transition is used. * @stable ICU 69 */ UCAL_TZ_LOCAL_DAYLIGHT_FORMER = UCAL_TZ_LOCAL_FORMER | 0x03, /** * An input time is interpreted as daylight saving time when * local time is switched to/from standard time. When both * sides of a time zone transition are standard time, * or daylight saving time, the local time after the * transition is used. * @stable ICU 69 */ UCAL_TZ_LOCAL_DAYLIGHT_LATTER = UCAL_TZ_LOCAL_LATTER | 0x03, }; typedef enum UTimeZoneLocalOption UTimeZoneLocalOption; /**< @stable ICU 69 */ /** * Returns the time zone raw and GMT offset for the given moment * in time. Upon return, local-millis = GMT-millis + rawOffset + * dstOffset. All computations are performed in the proleptic * Gregorian calendar. * * @param cal The UCalendar which specifies the local date and time value to query. * @param nonExistingTimeOpt The option to indicate how to interpret the date and * time in the calendar when it represents a local time that is skipped at a positive time * zone transition (e.g. when the daylight saving time starts or the time zone * offset is increased due to a time zone rule change). * @param duplicatedTimeOpt The option to indicate how to interpret the date and * time in the calendar when it represents a local time that repeats multiple times at a * negative time zone transition (e.g. when the daylight saving time ends or the * time zone offset is decreased due to a time zone rule change) * @param rawOffset output parameter to receive the raw offset, that * is, the offset not including DST adjustments. * If the status is set to one of the error codes, the value set is unspecified.
* @param dstOffset output parameter to receive the DST offset, * that is, the offset to be added to `rawOffset' to obtain the * total offset between local and GMT time. If DST is not in * effect, this value is zero; otherwise it is a positive value, * typically one hour. * If the status is set to one of the error codes, the value set is unspecified. * @param status A pointer to a UErrorCode to receive any errors. * @stable ICU 69 */ U_CAPI void U_EXPORT2 ucal_getTimeZoneOffsetFromLocal( const UCalendar* cal, UTimeZoneLocalOption nonExistingTimeOpt, UTimeZoneLocalOption duplicatedTimeOpt, int32_t* rawOffset, int32_t* dstOffset, UErrorCode* status); #endif // (NTDDI_VERSION >= NTDDI_WIN11_ZN) #endif /* #if !UCONFIG_NO_FORMATTING */ #endif // ucol.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (c) 1996-2015, International Business Machines Corporation and others. * All Rights Reserved. ******************************************************************************* */ #ifndef UCOL_H #define UCOL_H #if !UCONFIG_NO_COLLATION /** * \file * \brief C API: Collator * *

Collator C API

* * The C API for Collator performs locale-sensitive * string comparison. You use this service to build * searching and sorting routines for natural language text. *

* For more information about the collation service see * the User Guide. *

* Collation service provides correct sorting orders for most locales supported in ICU. * If specific data for a locale is not available, the order eventually falls back * to the CLDR root sort order. *

* Sort ordering may be customized by providing your own set of rules. For more on * this subject see the * Collation Customization section of the User Guide. *

* @see UCollationResult * @see UNormalizationMode * @see UCollationStrength * @see UCollationElements */ /** A collator. * For usage in C programs. */ struct UCollator; /** structure representing a collator object instance * @stable ICU 2.0 */ typedef struct UCollator UCollator; /** * UCOL_LESS is returned if source string is compared to be less than target * string in the ucol_strcoll() method. * UCOL_EQUAL is returned if source string is compared to be equal to target * string in the ucol_strcoll() method. * UCOL_GREATER is returned if source string is compared to be greater than * target string in the ucol_strcoll() method. * @see ucol_strcoll() *

* Possible values for a comparison result * @stable ICU 2.0 */ typedef enum { /** string a == string b */ UCOL_EQUAL = 0, /** string a > string b */ UCOL_GREATER = 1, /** string a < string b */ UCOL_LESS = -1 } UCollationResult ; /** Enum containing attribute values for controlling collation behavior. * Here are all the allowable values. Not every attribute can take every value. The only * universal value is UCOL_DEFAULT, which resets the attribute value to the predefined * value for that locale * @stable ICU 2.0 */ typedef enum { /** accepted by most attributes */ UCOL_DEFAULT = -1, /** Primary collation strength */ UCOL_PRIMARY = 0, /** Secondary collation strength */ UCOL_SECONDARY = 1, /** Tertiary collation strength */ UCOL_TERTIARY = 2, /** Default collation strength */ UCOL_DEFAULT_STRENGTH = UCOL_TERTIARY, UCOL_CE_STRENGTH_LIMIT, /** Quaternary collation strength */ UCOL_QUATERNARY=3, /** Identical collation strength */ UCOL_IDENTICAL=15, UCOL_STRENGTH_LIMIT, /** Turn the feature off - works for UCOL_FRENCH_COLLATION, UCOL_CASE_LEVEL, UCOL_HIRAGANA_QUATERNARY_MODE & UCOL_DECOMPOSITION_MODE*/ UCOL_OFF = 16, /** Turn the feature on - works for UCOL_FRENCH_COLLATION, UCOL_CASE_LEVEL, UCOL_HIRAGANA_QUATERNARY_MODE & UCOL_DECOMPOSITION_MODE*/ UCOL_ON = 17, /** Valid for UCOL_ALTERNATE_HANDLING. Alternate handling will be shifted */ UCOL_SHIFTED = 20, /** Valid for UCOL_ALTERNATE_HANDLING. Alternate handling will be non ignorable */ UCOL_NON_IGNORABLE = 21, /** Valid for UCOL_CASE_FIRST - lower case sorts before upper case */ UCOL_LOWER_FIRST = 24, /** upper case sorts before lower case */ UCOL_UPPER_FIRST = 25, } UColAttributeValue; /** * Enum containing the codes for reordering segments of the collation table that are not script * codes. These reordering codes are to be used in conjunction with the script codes.
* @see ucol_getReorderCodes * @see ucol_setReorderCodes * @see ucol_getEquivalentReorderCodes * @see UScriptCode * @stable ICU 4.8 */ typedef enum { /** * A special reordering code that is used to specify the default * reordering codes for a locale. * @stable ICU 4.8 */ UCOL_REORDER_CODE_DEFAULT = -1, /** * A special reordering code that is used to specify no reordering codes. * @stable ICU 4.8 */ UCOL_REORDER_CODE_NONE = USCRIPT_UNKNOWN, /** * A special reordering code that is used to specify all other codes used for * reordering except for the codes listed as UColReorderCode values and those * listed explicitly in a reordering. * @stable ICU 4.8 */ UCOL_REORDER_CODE_OTHERS = USCRIPT_UNKNOWN, /** * Characters with the space property. * This is equivalent to the rule value "space". * @stable ICU 4.8 */ UCOL_REORDER_CODE_SPACE = 0x1000, /** * The first entry in the enumeration of reordering groups. This is intended for use in * range checking and enumeration of the reorder codes. * @stable ICU 4.8 */ UCOL_REORDER_CODE_FIRST = UCOL_REORDER_CODE_SPACE, /** * Characters with the punctuation property. * This is equivalent to the rule value "punct". * @stable ICU 4.8 */ UCOL_REORDER_CODE_PUNCTUATION = 0x1001, /** * Characters with the symbol property. * This is equivalent to the rule value "symbol". * @stable ICU 4.8 */ UCOL_REORDER_CODE_SYMBOL = 0x1002, /** * Characters with the currency property. * This is equivalent to the rule value "currency". * @stable ICU 4.8 */ UCOL_REORDER_CODE_CURRENCY = 0x1003, /** * Characters with the digit property. * This is equivalent to the rule value "digit". * @stable ICU 4.8 */ UCOL_REORDER_CODE_DIGIT = 0x1004, } UColReorderCode; /** * Base letter represents a primary difference. Set comparison * level to UCOL_PRIMARY to ignore secondary and tertiary differences. * Use this to set the strength of a Collator object.
* Example of primary difference, "abc" < "abd" * * Diacritical differences on the same base letter represent a secondary * difference. Set comparison level to UCOL_SECONDARY to ignore tertiary * differences. Use this to set the strength of a Collator object. * Example of secondary difference, "ä" >> "a". * * Uppercase and lowercase versions of the same character represent a * tertiary difference. Set comparison level to UCOL_TERTIARY to include * all comparison differences. Use this to set the strength of a Collator * object. * Example of tertiary difference, "abc" <<< "ABC". * * Two characters are considered "identical" when they have the same * unicode spellings. UCOL_IDENTICAL. * For example, "ä" == "ä". * * UCollationStrength is also used to determine the strength of sort keys * generated from UCollator objects * These values can be now found in the UColAttributeValue enum. * @stable ICU 2.0 **/ typedef UColAttributeValue UCollationStrength; /** Attributes that collation service understands. All the attributes can take UCOL_DEFAULT * value, as well as the values specific to each one. * @stable ICU 2.0 */ typedef enum { /** Attribute for direction of secondary weights - used in Canadian French. * Acceptable values are UCOL_ON, which results in secondary weights * being considered backwards and UCOL_OFF which treats secondary * weights in the order they appear. * @stable ICU 2.0 */ UCOL_FRENCH_COLLATION, /** Attribute for handling variable elements. * Acceptable values are UCOL_NON_IGNORABLE (default) * which treats all the codepoints with non-ignorable * primary weights in the same way, * and UCOL_SHIFTED which causes codepoints with primary * weights that are equal or below the variable top value * to be ignored on primary level and moved to the quaternary * level. * @stable ICU 2.0 */ UCOL_ALTERNATE_HANDLING, /** Controls the ordering of upper and lower case letters.
* Acceptable values are UCOL_OFF (default), which orders * upper and lower case letters in accordance to their tertiary * weights, UCOL_UPPER_FIRST which forces upper case letters to * sort before lower case letters, and UCOL_LOWER_FIRST which does * the opposite. * @stable ICU 2.0 */ UCOL_CASE_FIRST, /** Controls whether an extra case level (positioned before the third * level) is generated or not. Acceptable values are UCOL_OFF (default), * when case level is not generated, and UCOL_ON which causes the case * level to be generated. Contents of the case level are affected by * the value of UCOL_CASE_FIRST attribute. A simple way to ignore * accent differences in a string is to set the strength to UCOL_PRIMARY * and enable case level. * @stable ICU 2.0 */ UCOL_CASE_LEVEL, /** Controls whether the normalization check and necessary normalizations * are performed. When set to UCOL_OFF (default) no normalization check * is performed. The correctness of the result is guaranteed only if the * input data is in so-called FCD form (see users manual for more info). * When set to UCOL_ON, an incremental check is performed to see whether * the input data is in the FCD form. If the data is not in the FCD form, * incremental NFD normalization is performed. * @stable ICU 2.0 */ UCOL_NORMALIZATION_MODE, /** An alias for UCOL_NORMALIZATION_MODE attribute. * @stable ICU 2.0 */ UCOL_DECOMPOSITION_MODE = UCOL_NORMALIZATION_MODE, /** The strength attribute. Can be either UCOL_PRIMARY, UCOL_SECONDARY, * UCOL_TERTIARY, UCOL_QUATERNARY or UCOL_IDENTICAL. The usual strength * for most locales (except Japanese) is tertiary. * * Quaternary strength * is useful when combined with shifted setting for alternate handling * attribute and for JIS X 4061 collation, when it is used to distinguish * between Katakana and Hiragana. * Otherwise, quaternary level * is affected only by the number of non-ignorable code points in * the string. 
* * Identical strength is rarely useful, as it amounts * to codepoints of the NFD form of the string. * @stable ICU 2.0 */ UCOL_STRENGTH, /** * When turned on, this attribute makes * substrings of digits sort according to their numeric values. * * This is a way to get '100' to sort AFTER '2'. Note that the longest * digit substring that can be treated as a single unit is * 254 digits (not counting leading zeros). If a digit substring is * longer than that, the digits beyond the limit will be treated as a * separate digit substring. * * A "digit" in this sense is a code point with General_Category=Nd, * which does not include circled numbers, roman numerals, etc. * Only a contiguous digit substring is considered, that is, * non-negative integers without separators. * There is no support for plus/minus signs, decimals, exponents, etc. * * @stable ICU 2.8 */ UCOL_NUMERIC_COLLATION = UCOL_STRENGTH + 2, /* Do not conditionalize the following with #ifndef U_HIDE_DEPRECATED_API, * it is needed for layout of RuleBasedCollator object. */ #ifndef U_FORCE_HIDE_DEPRECATED_API /** * One more than the highest normal UColAttribute value. * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420. */ UCOL_ATTRIBUTE_COUNT #endif // U_FORCE_HIDE_DEPRECATED_API } UColAttribute; /** Options for retrieving the rule string * @stable ICU 2.0 */ typedef enum { /** * Retrieves the tailoring rules only. * Same as calling the version of getRules() without UColRuleOption. * @stable ICU 2.0 */ UCOL_TAILORING_ONLY, /** * Retrieves the "UCA rules" concatenated with the tailoring rules. * The "UCA rules" are an approximation of the root collator's sort order. * They are almost never used or useful at runtime and can be removed from the data. * See https://unicode-org.github.io/icu/userguide/collation/customization#building-on-existing-locales * @stable ICU 2.0 */ UCOL_FULL_RULES } UColRuleOption ; /** * Open a UCollator for comparing strings. 
* * For some languages, multiple collation types are available; * for example, "de@collation=phonebook". * Starting with ICU 54, collation attributes can be specified via locale keywords as well, * in the old locale extension syntax ("el@colCaseFirst=upper") * or in language tag syntax ("el-u-kf-upper"). * See User Guide: Collation API. * * The UCollator pointer is used in all the calls to the Collation * service. After finished, collator must be disposed of by calling * {@link #ucol_close }. * @param loc The locale containing the required collation rules. * Special values for locales can be passed in - * if NULL is passed for the locale, the default locale * collation rules will be used. If empty string ("") or * "root" are passed, the root collator will be returned. * @param status A pointer to a UErrorCode to receive any errors * @return A pointer to a UCollator, or 0 if an error occurred. * @see ucol_openRules * @see ucol_safeClone * @see ucol_close * @stable ICU 2.0 */ U_CAPI UCollator* U_EXPORT2 ucol_open(const char *loc, UErrorCode *status); /** * Produce a UCollator instance according to the rules supplied. * The rules are used to change the default ordering, defined in the * UCA in a process called tailoring. The resulting UCollator pointer * can be used in the same way as the one obtained by {@link #ucol_open }. * @param rules A string describing the collation rules. For the syntax * of the rules please see users guide. * @param rulesLength The length of rules, or -1 if null-terminated. * @param normalizationMode The normalization mode: One of * UCOL_OFF (expect the text to not need normalization), * UCOL_ON (normalize), or * UCOL_DEFAULT (set the mode according to the rules) * @param strength The default collation strength; one of UCOL_PRIMARY, UCOL_SECONDARY, * UCOL_TERTIARY, UCOL_IDENTICAL,UCOL_DEFAULT_STRENGTH - can be also set in the rules. * @param parseError A pointer to UParseError to receive information about errors * occurred during parsing.
This argument can currently be set * to NULL, but at users own risk. Please provide a real structure. * @param status A pointer to a UErrorCode to receive any errors * @return A pointer to a UCollator. It is not guaranteed that NULL be returned in case * of error - please use status argument to check for errors. * @see ucol_open * @see ucol_safeClone * @see ucol_close * @stable ICU 2.0 */ U_CAPI UCollator* U_EXPORT2 ucol_openRules( const UChar *rules, int32_t rulesLength, UColAttributeValue normalizationMode, UCollationStrength strength, UParseError *parseError, UErrorCode *status); /** * Get a set containing the expansions defined by the collator. The set includes * both the root collator's expansions and the expansions defined by the tailoring * @param coll collator * @param contractions if not NULL, the set to hold the contractions * @param expansions if not NULL, the set to hold the expansions * @param addPrefixes add the prefix contextual elements to contractions * @param status to hold the error code * * @stable ICU 3.4 */ U_CAPI void U_EXPORT2 ucol_getContractionsAndExpansions( const UCollator *coll, USet *contractions, USet *expansions, UBool addPrefixes, UErrorCode *status); /** * Close a UCollator. * Once closed, a UCollator should not be used. Every open collator should * be closed. Otherwise, a memory leak will result. * @param coll The UCollator to close. * @see ucol_open * @see ucol_openRules * @see ucol_safeClone * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ucol_close(UCollator *coll); /** * Compare two strings. * The strings will be compared using the options already specified. * @param coll The UCollator containing the comparison rules. * @param source The source string. * @param sourceLength The length of source, or -1 if null-terminated. * @param target The target string. * @param targetLength The length of target, or -1 if null-terminated. 
* @return The result of comparing the strings; one of UCOL_EQUAL, * UCOL_GREATER, UCOL_LESS * @see ucol_greater * @see ucol_greaterOrEqual * @see ucol_equal * @stable ICU 2.0 */ U_CAPI UCollationResult U_EXPORT2 ucol_strcoll( const UCollator *coll, const UChar *source, int32_t sourceLength, const UChar *target, int32_t targetLength); /** * Compare two strings in UTF-8. * The strings will be compared using the options already specified. * Note: When an input string contains a malformed UTF-8 byte sequence, * this function treats these bytes as REPLACEMENT CHARACTER (U+FFFD). * @param coll The UCollator containing the comparison rules. * @param source The source UTF-8 string. * @param sourceLength The length of source, or -1 if null-terminated. * @param target The target UTF-8 string. * @param targetLength The length of target, or -1 if null-terminated. * @param status A pointer to a UErrorCode to receive any errors * @return The result of comparing the strings; one of UCOL_EQUAL, * UCOL_GREATER, UCOL_LESS * @see ucol_greater * @see ucol_greaterOrEqual * @see ucol_equal * @stable ICU 50 */ U_CAPI UCollationResult U_EXPORT2 ucol_strcollUTF8( const UCollator *coll, const char *source, int32_t sourceLength, const char *target, int32_t targetLength, UErrorCode *status); /** * Determine if one string is greater than another. * This function is equivalent to {@link #ucol_strcoll } == UCOL_GREATER * @param coll The UCollator containing the comparison rules. * @param source The source string. * @param sourceLength The length of source, or -1 if null-terminated. * @param target The target string. * @param targetLength The length of target, or -1 if null-terminated. * @return true if source is greater than target, false otherwise.
* @see ucol_strcoll * @see ucol_greaterOrEqual * @see ucol_equal * @stable ICU 2.0 */ U_CAPI UBool U_EXPORT2 ucol_greater(const UCollator *coll, const UChar *source, int32_t sourceLength, const UChar *target, int32_t targetLength); /** * Determine if one string is greater than or equal to another. * This function is equivalent to {@link #ucol_strcoll } != UCOL_LESS * @param coll The UCollator containing the comparison rules. * @param source The source string. * @param sourceLength The length of source, or -1 if null-terminated. * @param target The target string. * @param targetLength The length of target, or -1 if null-terminated. * @return true if source is greater than or equal to target, false otherwise. * @see ucol_strcoll * @see ucol_greater * @see ucol_equal * @stable ICU 2.0 */ U_CAPI UBool U_EXPORT2 ucol_greaterOrEqual(const UCollator *coll, const UChar *source, int32_t sourceLength, const UChar *target, int32_t targetLength); /** * Compare two strings for equality. * This function is equivalent to {@link #ucol_strcoll } == UCOL_EQUAL * @param coll The UCollator containing the comparison rules. * @param source The source string. * @param sourceLength The length of source, or -1 if null-terminated. * @param target The target string. * @param targetLength The length of target, or -1 if null-terminated. * @return true if source is equal to target, false otherwise * @see ucol_strcoll * @see ucol_greater * @see ucol_greaterOrEqual * @stable ICU 2.0 */ U_CAPI UBool U_EXPORT2 ucol_equal(const UCollator *coll, const UChar *source, int32_t sourceLength, const UChar *target, int32_t targetLength); /** * Compare two strings supplied through UCharIterators. * The strings will be compared using the options already specified. * @param coll The UCollator containing the comparison rules. * @param sIter The source string iterator. * @param tIter The target string iterator.
* @param status A pointer to a UErrorCode to receive any errors * @return The result of comparing the strings; one of UCOL_EQUAL, * UCOL_GREATER, UCOL_LESS * @see ucol_strcoll * @stable ICU 2.6 */ U_CAPI UCollationResult U_EXPORT2 ucol_strcollIter( const UCollator *coll, UCharIterator *sIter, UCharIterator *tIter, UErrorCode *status); /** * Get the collation strength used in a UCollator. * The strength influences how strings are compared. * @param coll The UCollator to query. * @return The collation strength; one of UCOL_PRIMARY, UCOL_SECONDARY, * UCOL_TERTIARY, UCOL_QUATERNARY, UCOL_IDENTICAL * @see ucol_setStrength * @stable ICU 2.0 */ U_CAPI UCollationStrength U_EXPORT2 ucol_getStrength(const UCollator *coll); /** * Set the collation strength used in a UCollator. * The strength influences how strings are compared. * @param coll The UCollator to set. * @param strength The desired collation strength; one of UCOL_PRIMARY, * UCOL_SECONDARY, UCOL_TERTIARY, UCOL_QUATERNARY, UCOL_IDENTICAL, UCOL_DEFAULT * @see ucol_getStrength * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ucol_setStrength(UCollator *coll, UCollationStrength strength); /** * Retrieves the reordering codes for this collator. * These reordering codes are a combination of UScript codes and UColReorderCode entries. * @param coll The UCollator to query. * @param dest The array to fill with the script ordering. * @param destCapacity The length of dest. If it is 0, then dest may be NULL and the function * will only return the length of the result without writing any codes (pre-flighting). * @param pErrorCode Must be a valid pointer to an error code value, which must not indicate a * failure before the function call. * @return The number of reordering codes written to the dest array.
* @see ucol_setReorderCodes * @see ucol_getEquivalentReorderCodes * @see UScriptCode * @see UColReorderCode * @stable ICU 4.8 */ U_CAPI int32_t U_EXPORT2 ucol_getReorderCodes(const UCollator* coll, int32_t* dest, int32_t destCapacity, UErrorCode *pErrorCode); /** * Sets the reordering codes for this collator. * Collation reordering allows scripts and some other groups of characters * to be moved relative to each other. This reordering is done on top of * the DUCET/CLDR standard collation order. Reordering can specify groups to be placed * at the start and/or the end of the collation order. These groups are specified using * UScript codes and UColReorderCode entries. * *

By default, reordering codes specified for the start of the order are placed in the * order given after several special non-script blocks. These special groups of characters * are space, punctuation, symbol, currency, and digit. These special groups are represented with * UColReorderCode entries. Script groups can be intermingled with * these special non-script groups if those special groups are explicitly specified in the reordering. * *

The special code OTHERS stands for any script that is not explicitly * mentioned in the list of reordering codes given. Anything that is after OTHERS * will go at the very end of the reordering in the order given. * *

The special reorder code DEFAULT will reset the reordering for this collator * to the default for this collator. The default reordering may be the DUCET/CLDR order or may be a reordering that * was specified when this collator was created from resource data or from rules. The * DEFAULT code must be the sole code supplied when it is used. * If not, then U_ILLEGAL_ARGUMENT_ERROR will be set. * *

The special reorder code NONE will remove any reordering for this collator. * The result of setting no reordering will be to have the DUCET/CLDR ordering used. The * NONE code must be the sole code supplied when it is used. * * @param coll The UCollator to set. * @param reorderCodes An array of script codes in the new order. This can be NULL if the * length is also set to 0. An empty array will clear any reordering codes on the collator. * @param reorderCodesLength The length of reorderCodes. * @param pErrorCode Must be a valid pointer to an error code value, which must not indicate a * failure before the function call. * @see ucol_getReorderCodes * @see ucol_getEquivalentReorderCodes * @see UScriptCode * @see UColReorderCode * @stable ICU 4.8 */ U_CAPI void U_EXPORT2 ucol_setReorderCodes(UCollator* coll, const int32_t* reorderCodes, int32_t reorderCodesLength, UErrorCode *pErrorCode); /** * Retrieves the reorder codes that are grouped with the given reorder code. Some reorder * codes will be grouped and must reorder together. * Beginning with ICU 55, scripts only reorder together if they are primary-equal, * for example Hiragana and Katakana. * * @param reorderCode The reorder code to determine equivalence for. * @param dest The array to fill with the script ordering. * @param destCapacity The length of dest. If it is 0, then dest may be NULL and the function * will only return the length of the result without writing any codes (pre-flighting). * @param pErrorCode Must be a valid pointer to an error code value, which must not indicate * a failure before the function call. * @return The number of reordering codes written to the dest array. * @see ucol_setReorderCodes * @see ucol_getReorderCodes * @see UScriptCode * @see UColReorderCode * @stable ICU 4.8 */ U_CAPI int32_t U_EXPORT2 ucol_getEquivalentReorderCodes(int32_t reorderCode, int32_t* dest, int32_t destCapacity, UErrorCode *pErrorCode); /** * Get the display name for a UCollator. 
* The display name is suitable for presentation to a user. * @param objLoc The locale of the collator in question. * @param dispLoc The locale for display. * @param result A pointer to a buffer to receive the display name. * @param resultLength The maximum size of result. * @param status A pointer to a UErrorCode to receive any errors * @return The total buffer size needed; if greater than resultLength, * the output was truncated. * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 ucol_getDisplayName( const char *objLoc, const char *dispLoc, UChar *result, int32_t resultLength, UErrorCode *status); /** * Get a locale for which collation rules are available. * A UCollator in a locale returned by this function will perform the correct * collation for the locale. * @param localeIndex The index of the desired locale. * @return A locale for which collation rules are available, or 0 if none. * @see ucol_countAvailable * @stable ICU 2.0 */ U_CAPI const char* U_EXPORT2 ucol_getAvailable(int32_t localeIndex); /** * Determine how many locales have collation rules available. * This function is most useful for determining the loop ending condition for * calls to {@link #ucol_getAvailable }. * @return The number of locales for which collation rules are available. * @see ucol_getAvailable * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 ucol_countAvailable(void); #if !UCONFIG_NO_SERVICE /** * Create a string enumerator of all locales for which a valid * collator may be opened. * @param status input-output error code * @return a string enumeration over locale strings. The caller is * responsible for closing the result. * @stable ICU 3.0 */ U_CAPI UEnumeration* U_EXPORT2 ucol_openAvailableLocales(UErrorCode *status); #endif /** * Create a string enumerator of all possible keywords that are relevant to * collation. At this point, the only recognized keyword for this * service is "collation". * @param status input-output error code * @return a string enumeration over keyword strings.
The caller is * responsible for closing the result. * @stable ICU 3.0 */ U_CAPI UEnumeration* U_EXPORT2 ucol_getKeywords(UErrorCode *status); /** * Given a keyword, create a string enumeration of all values * for that keyword that are currently in use. * @param keyword a particular keyword as enumerated by * ucol_getKeywords. If any other keyword is passed in, *status is set * to U_ILLEGAL_ARGUMENT_ERROR. * @param status input-output error code * @return a string enumeration over collation keyword values, or NULL * upon error. The caller is responsible for closing the result. * @stable ICU 3.0 */ U_CAPI UEnumeration* U_EXPORT2 ucol_getKeywordValues(const char *keyword, UErrorCode *status); /** * Given a key and a locale, returns an array of string values in a preferred * order that would make a difference. These are all and only those values where * the open (creation) of the service with the locale formed from the input locale * plus input keyword and that value has different behavior than creation with the * input locale alone. * @param key one of the keys supported by this service. For now, only * "collation" is supported. * @param locale the locale * @param commonlyUsed if set to true it will return only commonly used values * with the given locale in preferred order. Otherwise, * it will return all the available values for the locale. * @param status error status * @return a string enumeration over keyword values for the given key and the locale. * @stable ICU 4.2 */ U_CAPI UEnumeration* U_EXPORT2 ucol_getKeywordValuesForLocale(const char* key, const char* locale, UBool commonlyUsed, UErrorCode* status); /** * Return the functionally equivalent locale for the specified * input locale, with respect to given keyword, for the * collation service. If two different input locale + keyword * combinations produce the same result locale, then collators * instantiated for these two different input locales will behave * equivalently. 
The converse is not always true; two collators * may in fact be equivalent, but return different results, due to * internal details. The return result has no other meaning than * that stated above, and implies nothing as to the relationship * between the two locales. This is intended for use by * applications that wish to cache collators, or otherwise reuse * collators when possible. The functional equivalent may change * over time. For more information, please see the * Locales and Services section of the ICU User Guide. * @param result fillin for the functionally equivalent result locale * @param resultCapacity capacity of the fillin buffer * @param keyword a particular keyword as enumerated by * ucol_getKeywords. * @param locale the specified input locale * @param isAvailable if non-NULL, pointer to a fillin parameter that * on return indicates whether the specified input locale was 'available' * to the collation service. A locale is defined as 'available' if it * physically exists within the collation locale data. * @param status pointer to input-output error code * @return the actual buffer size needed for the locale. If greater * than resultCapacity, the returned full name will be truncated and * an error code will be returned. * @stable ICU 3.0 */ U_CAPI int32_t U_EXPORT2 ucol_getFunctionalEquivalent(char* result, int32_t resultCapacity, const char* keyword, const char* locale, UBool* isAvailable, UErrorCode* status); /** * Get the collation tailoring rules from a UCollator. * The rules will follow the rule syntax. * @param coll The UCollator to query. * @param length Output parameter; presumably receives the length, in UChars, of the returned * rules string (not stated in this header - confirm against the implementation). * @return The collation tailoring rules. * @stable ICU 2.0 */ U_CAPI const UChar* U_EXPORT2 ucol_getRules( const UCollator *coll, int32_t *length); /** * Get a sort key for a string from a UCollator. * Sort keys may be compared using strcmp. * * Note that sort keys are often less efficient than simply doing comparison. * For more details, see the ICU User Guide.
* * Like ICU functions that write to an output buffer, the buffer contents * are undefined if the buffer capacity (resultLength parameter) is too small. * Unlike ICU functions that write a string to an output buffer, * the terminating zero byte is counted in the sort key length. * @param coll The UCollator containing the collation rules. * @param source The string to transform. * @param sourceLength The length of source, or -1 if null-terminated. * @param result A pointer to a buffer to receive the sort key. * @param resultLength The maximum size of result. * @return The size needed to fully store the sort key. * If there was an internal error generating the sort key, * a zero value is returned. * @see ucol_keyHashCode * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 ucol_getSortKey(const UCollator *coll, const UChar *source, int32_t sourceLength, uint8_t *result, int32_t resultLength); /** Gets the next count bytes of a sort key. Caller needs * to preserve state array between calls and to provide * the same type of UCharIterator set with the same string. * The destination buffer provided must be big enough to store * the number of requested bytes. * * The generated sort key may or may not be compatible with * sort keys generated using ucol_getSortKey(). * @param coll The UCollator containing the collation rules. * @param iter UCharIterator containing the string we need * the sort key to be calculated for. * @param state Opaque state of sortkey iteration. * @param dest Buffer to hold the resulting sortkey part * @param count number of sort key bytes required. * @param status error code indicator. * @return the actual number of bytes of a sortkey. It can be * smaller than count if we have reached the end of * the sort key.
* @stable ICU 2.6 */ U_CAPI int32_t U_EXPORT2 ucol_nextSortKeyPart(const UCollator *coll, UCharIterator *iter, uint32_t state[2], uint8_t *dest, int32_t count, UErrorCode *status); /** enum that is taken by ucol_getBound API * See below for explanation * do not change the values assigned to the * members of this enum. Underlying code * depends on them having these numbers * @stable ICU 2.0 */ typedef enum { /** lower bound */ UCOL_BOUND_LOWER = 0, /** upper bound that will match strings of exact size */ UCOL_BOUND_UPPER = 1, /** upper bound that will match all the strings that have the same initial substring as the given string */ UCOL_BOUND_UPPER_LONG = 2, } UColBoundMode; /** * Produce a bound for a given sortkey and a number of levels. * Return value is always the number of bytes needed, regardless of * whether the result buffer was big enough or even valid.
* Resulting bounds can be used to produce a range of strings that are * between upper and lower bounds. For example, if bounds are produced * for a sortkey of string "smith", strings between upper and lower * bounds with one level would include "Smith", "SMITH", "sMiTh".
* There are two upper bounds that can be produced. If UCOL_BOUND_UPPER * is produced, strings matched would be as above. However, if bound * produced using UCOL_BOUND_UPPER_LONG is used, the above example will * also match "Smithsonian" and similar.
* For more on usage, see example in cintltst/capitst.c in procedure * TestBounds. * Sort keys may be compared using strcmp. * @param source The source sortkey. * @param sourceLength The length of source, or -1 if null-terminated. * (If an unmodified sortkey is passed, it is always null * terminated). * @param boundType Type of bound required. It can be UCOL_BOUND_LOWER, which * produces a lower inclusive bound, UCOL_BOUND_UPPER, that * produces upper bound that matches strings of the same length * or UCOL_BOUND_UPPER_LONG that matches strings that have the * same starting substring as the source string. * @param noOfLevels Number of levels required in the resulting bound (for most * uses, the recommended value is 1). See users guide for * explanation on number of levels a sortkey can have. * @param result A pointer to a buffer to receive the resulting sortkey. * @param resultLength The maximum size of result. * @param status Used for returning error code if something went wrong. If the * number of levels requested is higher than the number of levels * in the source key, a warning (U_SORT_KEY_TOO_SHORT_WARNING) is * issued. * @return The size needed to fully store the bound. * @see ucol_keyHashCode * @stable ICU 2.1 */ U_CAPI int32_t U_EXPORT2 ucol_getBound(const uint8_t *source, int32_t sourceLength, UColBoundMode boundType, uint32_t noOfLevels, uint8_t *result, int32_t resultLength, UErrorCode *status); /** * Gets the version information for a Collator. Version is currently * an opaque 32-bit number which depends, among other things, on major * versions of the collator tailoring and UCA. * @param coll The UCollator to query. * @param info the version # information, the result will be filled in * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ucol_getVersion(const UCollator* coll, UVersionInfo info); /** * Gets the UCA version information for a Collator. Version is the * UCA version number (3.1.1, 4.0). * @param coll The UCollator to query. 
* @param info the version # information, the result will be filled in * @stable ICU 2.8 */ U_CAPI void U_EXPORT2 ucol_getUCAVersion(const UCollator* coll, UVersionInfo info); /** * Merges two sort keys. The levels are merged with their corresponding counterparts * (primaries with primaries, secondaries with secondaries etc.). Between the values * from the same level a separator is inserted. * * This is useful, for example, for combining sort keys from first and last names * to sort such pairs. * See http://www.unicode.org/reports/tr10/#Merging_Sort_Keys * * The recommended way to achieve "merged" sorting is by * concatenating strings with U+FFFE between them. * The concatenation has the same sort order as the merged sort keys, * but merge(getSortKey(str1), getSortKey(str2)) may differ from getSortKey(str1 + '\\uFFFE' + str2). * Using strings with U+FFFE may yield shorter sort keys. * * For details about Sort Key Features see * https://unicode-org.github.io/icu/userguide/collation/api#sort-key-features * * It is possible to merge multiple sort keys by consecutively merging * another one with the intermediate result. * * The length of the merge result is the sum of the lengths of the input sort keys. * * Example (uncompressed): *

191B1D 01 050505 01 910505 00
 * 1F2123 01 050505 01 910505 00
* will be merged as *
191B1D 02 1F2123 01 050505 02 050505 01 910505 02 910505 00
* * If the destination buffer is not big enough, then its contents are undefined. * If any of source lengths are zero or any of the source pointers are NULL/undefined, * the result is of size zero. * * @param src1 the first sort key * @param src1Length the length of the first sort key, including the zero byte at the end; * can be -1 if the function is to find the length * @param src2 the second sort key * @param src2Length the length of the second sort key, including the zero byte at the end; * can be -1 if the function is to find the length * @param dest the buffer where the merged sort key is written, * can be NULL if destCapacity==0 * @param destCapacity the number of bytes in the dest buffer * @return the length of the merged sort key, src1Length+src2Length; * can be larger than destCapacity, or 0 if an error occurs (only for illegal arguments), * in which cases the contents of dest is undefined * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length, const uint8_t *src2, int32_t src2Length, uint8_t *dest, int32_t destCapacity); /** * Universal attribute setter * @param coll collator whose attributes are to be changed * @param attr attribute type * @param value attribute value * @param status to indicate whether the operation went on smoothly or there were errors * @see UColAttribute * @see UColAttributeValue * @see ucol_getAttribute * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status); /** * Universal attribute getter * @param coll collator whose attributes are to be queried * @param attr attribute type * @param status to indicate whether the operation went on smoothly or there were errors * @return attribute value * @see UColAttribute * @see UColAttributeValue * @see ucol_setAttribute * @stable ICU 2.0 */ U_CAPI UColAttributeValue U_EXPORT2 ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status);
/** * Sets the variable top to the top of the specified reordering group. * The variable top determines the highest-sorting character * which is affected by UCOL_ALTERNATE_HANDLING. * If that attribute is set to UCOL_NON_IGNORABLE, then the variable top has no effect. * @param coll the collator * @param group one of UCOL_REORDER_CODE_SPACE, UCOL_REORDER_CODE_PUNCTUATION, * UCOL_REORDER_CODE_SYMBOL, UCOL_REORDER_CODE_CURRENCY; * or UCOL_REORDER_CODE_DEFAULT to restore the default max variable group * @param pErrorCode Standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @see ucol_getMaxVariable * @stable ICU 53 */ U_CAPI void U_EXPORT2 ucol_setMaxVariable(UCollator *coll, UColReorderCode group, UErrorCode *pErrorCode); /** * Returns the maximum reordering group whose characters are affected by UCOL_ALTERNATE_HANDLING. * @param coll the collator * @return the maximum variable reordering group. * @see ucol_setMaxVariable * @stable ICU 53 */ U_CAPI UColReorderCode U_EXPORT2 ucol_getMaxVariable(const UCollator *coll); /** * Gets the variable top value of a Collator. * @param coll collator which variable top needs to be retrieved * @param status error code (not changed by function). If error code is set, * the return value is undefined. * @return the variable top primary weight * @see ucol_getMaxVariable * @see ucol_setVariableTop * @see ucol_restoreVariableTop * @stable ICU 2.0 */ U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status); /** * Thread safe cloning operation. The result is a clone of a given collator. * @param coll collator to be cloned * @param stackBuffer Deprecated functionality as of ICU 52, use NULL.
* user allocated space for the new clone. * If NULL new memory will be allocated. * If buffer is not large enough, new memory will be allocated. * Clients can use the U_COL_SAFECLONE_BUFFERSIZE. * @param pBufferSize Deprecated functionality as of ICU 52, use NULL or 1.
* pointer to size of allocated space. * If *pBufferSize == 0, a sufficient size for use in cloning will * be returned ('pre-flighting') * If *pBufferSize is not enough for a stack-based safe clone, * new memory will be allocated. * @param status to indicate whether the operation went on smoothly or there were errors * An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any * allocations were necessary. * @return pointer to the new clone * @see ucol_open * @see ucol_openRules * @see ucol_close * @stable ICU 2.0 */ U_CAPI UCollator* U_EXPORT2 ucol_safeClone(const UCollator *coll, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status); #if (NTDDI_VERSION >= NTDDI_WIN11_ZN) /** * Thread safe cloning operation. The result is a clone of a given collator. * @param coll collator to be cloned * @param status to indicate whether the operation went on smoothly or there were errors * @return pointer to the new clone * @see ucol_open * @see ucol_openRules * @see ucol_close * @stable ICU 71 */ U_CAPI UCollator* U_EXPORT2 ucol_clone(const UCollator *coll, UErrorCode *status); #endif // (NTDDI_VERSION >= NTDDI_WIN11_ZN) /** * Returns current rules. Delta defines whether full rules are returned or just the tailoring. * Returns number of UChars needed to store rules. If buffer is NULL or bufferLen is not enough * to store rules, will store up to available space. * * ucol_getRules() should normally be used instead. * See https://unicode-org.github.io/icu/userguide/collation/customization#building-on-existing-locales * @param coll collator to get the rules from * @param delta one of UCOL_TAILORING_ONLY, UCOL_FULL_RULES. * @param buffer buffer to store the result in. If NULL, you'll get no rules. * @param bufferLen length of buffer to store rules in. If less than needed you'll get only the part that fits in. 
* @return number of UChars needed to store the rules * @stable ICU 2.0 * @see UCOL_FULL_RULES */ U_CAPI int32_t U_EXPORT2 ucol_getRulesEx(const UCollator *coll, UColRuleOption delta, UChar *buffer, int32_t bufferLen); /** * Gets the locale name of the collator. If the collator * is instantiated from the rules, then this function returns * NULL. * @param coll The UCollator for which the locale is needed * @param type You can choose between requested, valid and actual * locale. For description see the definition of * ULocDataLocaleType in uloc.h * @param status error code of the operation * @return real locale name from which the collation data comes. * If the collator was instantiated from rules, returns * NULL. * @stable ICU 2.8 */ U_CAPI const char * U_EXPORT2 ucol_getLocaleByType(const UCollator *coll, ULocDataLocaleType type, UErrorCode *status); /** * Get a Unicode set that contains all the characters and sequences tailored in * this collator. The result must be disposed of by using uset_close. * @param coll The UCollator for which we want to get tailored chars * @param status error code of the operation * @return a pointer to a newly created USet. Must be disposed of by using uset_close * @see ucol_openRules * @see uset_close * @stable ICU 2.4 */ U_CAPI USet * U_EXPORT2 ucol_getTailoredSet(const UCollator *coll, UErrorCode *status); /** Creates a binary image of a collator. This binary image can be stored and * later used to instantiate a collator using ucol_openBinary. * This API supports preflighting. * @param coll Collator * @param buffer a fill-in buffer to receive the binary image * @param capacity capacity of the destination buffer * @param status for catching errors * @return size of the image * @see ucol_openBinary * @stable ICU 3.2 */ U_CAPI int32_t U_EXPORT2 ucol_cloneBinary(const UCollator *coll, uint8_t *buffer, int32_t capacity, UErrorCode *status); /** Opens a collator from a collator binary image created using * ucol_cloneBinary.
Binary image used in instantiation of the * collator remains owned by the user and should stay around for * the lifetime of the collator. The API also takes a base collator * which must be the root collator. * @param bin binary image owned by the user and required through the * lifetime of the collator * @param length size of the image. If negative, the API will try to * figure out the length of the image * @param base Base collator, for lookup of untailored characters. * Must be the root collator, must not be NULL. * The base is required to be present through the lifetime of the collator. * @param status for catching errors * @return newly created collator * @see ucol_cloneBinary * @stable ICU 3.2 */ U_CAPI UCollator* U_EXPORT2 ucol_openBinary(const uint8_t *bin, int32_t length, const UCollator *base, UErrorCode *status); #endif /* #if !UCONFIG_NO_COLLATION */ #endif // ucoleitr.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2001-2014, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* * * File ucoleitr.h * * Modification History: * * Date Name Description * 02/15/2001 synwee Modified all methods to process its own function * instead of calling the equivalent c++ api (coleitr.h) *******************************************************************************/ #ifndef UCOLEITR_H #define UCOLEITR_H #if !UCONFIG_NO_COLLATION /** * This indicates that an error has occurred during processing or that no more CEs are * to be returned. * @stable ICU 2.0 */ #define UCOL_NULLORDER ((int32_t)0xFFFFFFFF) /** * The UCollationElements struct. * For usage in C programs.
* @stable ICU 2.0 */ typedef struct UCollationElements UCollationElements; /** * \file * \brief C API: UCollationElements * * The UCollationElements API is used as an iterator to walk through each * character of an international string. Use the iterator to return the * ordering priority of the positioned character. The ordering priority of a * character, which we refer to as a key, defines how a character is collated * in the given collation object. * For example, consider the following in Slovak and in traditional Spanish collation: *
 * .       "ca" -> the first key is key('c') and second key is key('a').
 * .       "cha" -> the first key is key('ch') and second key is key('a').
 * 
* And in German phonebook collation, *
 * .       "<ae ligature>b"-> the first key is key('a'), the second key is key('e'), and
 * .       the third key is key('b').
 * 
*

Example of the iterator usage: (without error checking) *

 * .  void CollationElementIterator_Example()
 * .  {
 * .      UChar *str;
 * .      int32_t order, primaryOrder;
 * .      UCollationElements *c;
 * .      UCollator *coll;
 * .      UErrorCode success = U_ZERO_ERROR;
 * .      str=(UChar*)malloc(sizeof(UChar) * (strlen("This is a test")+1) );
 * .      u_uastrcpy(str, "This is a test");
 * .      coll = ucol_open(NULL, &success);
 * .      c = ucol_openElements(coll, str, u_strlen(str), &success);
 * .      order = ucol_next(c, &success);
 * .      ucol_reset(c);
 * .      order = ucol_previous(c, &success);
 * .      ucol_closeElements(c);
 * .      ucol_close(coll);
 * .      free(str);
 * .  }
 * 
*

* ucol_next() returns the collation order of the next character. * ucol_previous() returns the collation order of the previous character. * The Collation Element Iterator moves only in one direction between calls to * ucol_reset. That is, ucol_next() and ucol_previous() cannot be interleaved. * Whenever ucol_previous is to be called after ucol_next() or vice versa, * ucol_reset has to be called first to reset the status, shifting pointers to * either the end or the start of the string. Hence at the next call of * ucol_previous or ucol_next, the last or first collation order will be returned, respectively. * If a change of direction is done without a ucol_reset, the result is * undefined. * The result of a forward iterate (ucol_next) and reversed result of the * backward iterate (ucol_previous) on the same string are equivalent, if * collation orders with the value 0 are ignored. * Character based on the comparison level of the collator. A collation order * consists of primary order, secondary order and tertiary order. The data * type of the collation order is int32_t. * * @see UCollator */ /** * Open the collation elements for a string. * * The UCollationElements retains a pointer to the supplied text. * The caller must not modify or delete the text while the UCollationElements * object is used to iterate over this text. * * @param coll The collator containing the desired collation rules. * @param text The text to iterate over. * @param textLength The number of characters in text, or -1 if null-terminated * @param status A pointer to a UErrorCode to receive any errors. * @return a struct containing collation element information * @stable ICU 2.0 */ U_CAPI UCollationElements* U_EXPORT2 ucol_openElements(const UCollator *coll, const UChar *text, int32_t textLength, UErrorCode *status); /** * get a hash code for a key... Not very useful! * @param key the given key. * @param length the size of the key array. * @return the hash code. 
* @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 ucol_keyHashCode(const uint8_t* key, int32_t length); /** * Close a UCollationElements. * Once closed, a UCollationElements may no longer be used. * @param elems The UCollationElements to close. * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ucol_closeElements(UCollationElements *elems); /** * Reset the collation elements to their initial state. * This will move the 'cursor' to the beginning of the text. * Property settings for collation will be reset to the current status. * @param elems The UCollationElements to reset. * @see ucol_next * @see ucol_previous * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ucol_reset(UCollationElements *elems); /** * Get the ordering priority of the next collation element in the text. * A single character may contain more than one collation element. * @param elems The UCollationElements containing the text. * @param status A pointer to a UErrorCode to receive any errors. * @return The next collation elements ordering, otherwise returns UCOL_NULLORDER * if an error has occurred or if the end of string has been reached * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 ucol_next(UCollationElements *elems, UErrorCode *status); /** * Get the ordering priority of the previous collation element in the text. * A single character may contain more than one collation element. * Note that internally a stack is used to store buffered collation elements. * @param elems The UCollationElements containing the text. * @param status A pointer to a UErrorCode to receive any errors. Notably * a U_BUFFER_OVERFLOW_ERROR is returned if the internal stack * buffer has been exhausted. * @return The previous collation elements ordering, otherwise returns * UCOL_NULLORDER if an error has occurred or if the start of string has * been reached. 
* @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 ucol_previous(UCollationElements *elems, UErrorCode *status); /** * Get the maximum length of any expansion sequences that end with the * specified comparison order. * This is useful for .... ? * @param elems The UCollationElements containing the text. * @param order A collation order returned by ucol_previous or ucol_next. * @return maximum size of the expansion sequences ending with the collation * element or 1 if collation element does not occur at the end of any * expansion sequence * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 ucol_getMaxExpansion(const UCollationElements *elems, int32_t order); /** * Set the text containing the collation elements. * Property settings for collation will remain the same. * In order to reset the iterator to the current collation property settings, * ucol_reset() has to be called. * * The UCollationElements retains a pointer to the supplied text. * The caller must not modify or delete the text while the UCollationElements * object is used to iterate over this text. * * @param elems The UCollationElements to set. * @param text The source text containing the collation elements. * @param textLength The length of text, or -1 if null-terminated. * @param status A pointer to a UErrorCode to receive any errors. * @see ucol_getText * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ucol_setText( UCollationElements *elems, const UChar *text, int32_t textLength, UErrorCode *status); /** * Get the offset of the current source character. * This is an offset into the text of the character containing the current * collation elements. * @param elems The UCollationElements to query. * @return The offset of the current source character. * @see ucol_setOffset * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 ucol_getOffset(const UCollationElements *elems); /** * Set the offset of the current source character. * This is an offset into the text of the character to be processed. 
* Property settings for collation will remain the same. * In order to reset the iterator to the current collation property settings, * ucol_reset() has to be called. * @param elems The UCollationElements to set. * @param offset The desired character offset. * @param status A pointer to a UErrorCode to receive any errors. * @see ucol_getOffset * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 ucol_setOffset(UCollationElements *elems, int32_t offset, UErrorCode *status); /** * Get the primary order of a collation order. * @param order the collation order * @return the primary order of a collation order. * @stable ICU 2.6 */ U_CAPI int32_t U_EXPORT2 ucol_primaryOrder (int32_t order); /** * Get the secondary order of a collation order. * @param order the collation order * @return the secondary order of a collation order. * @stable ICU 2.6 */ U_CAPI int32_t U_EXPORT2 ucol_secondaryOrder (int32_t order); /** * Get the tertiary order of a collation order. * @param order the collation order * @return the tertiary order of a collation order. * @stable ICU 2.6 */ U_CAPI int32_t U_EXPORT2 ucol_tertiaryOrder (int32_t order); #endif /* #if !UCONFIG_NO_COLLATION */ #endif // ucsdet.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (C) 2005-2013, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * file name: ucsdet.h * encoding: UTF-8 * indentation:4 * * created on: 2005Aug04 * created by: Andy Heninger * * ICU Character Set Detection, API for C * * Draft version 18 Oct 2005 * */ #ifndef __UCSDET_H #define __UCSDET_H #if !UCONFIG_NO_CONVERSION /** * \file * \brief C API: Charset Detection API * * This API provides a facility for detecting the * charset or encoding of character data in an unknown text format. 
* The input data can be from an array of bytes. *

* Character set detection is at best an imprecise operation. The detection * process will attempt to identify the charset that best matches the characteristics * of the byte data, but the process is partly statistical in nature, and * the results can not be guaranteed to always be correct. *

* For best accuracy in charset detection, the input data should be primarily * in a single language, and a minimum of a few hundred bytes worth of plain text * in the language are needed. The detection process will attempt to * ignore html or xml style markup that could otherwise obscure the content. *

* An alternative to the ICU Charset Detector is the * Compact Encoding Detector, https://github.com/google/compact_enc_det. * It often gives more accurate results, especially with short input samples. */ struct UCharsetDetector; /** * Structure representing a charset detector * @stable ICU 3.6 */ typedef struct UCharsetDetector UCharsetDetector; struct UCharsetMatch; /** * Opaque structure representing a match that was identified * from a charset detection operation. * @stable ICU 3.6 */ typedef struct UCharsetMatch UCharsetMatch; /** * Open a charset detector. * * @param status Any error conditions occurring during the open * operation are reported back in this variable. * @return the newly opened charset detector. * @stable ICU 3.6 */ U_CAPI UCharsetDetector * U_EXPORT2 ucsdet_open(UErrorCode *status); /** * Close a charset detector. All storage and any other resources * owned by this charset detector will be released. Failure to * close a charset detector when finished with it can result in * memory leaks in the application. * * @param ucsd The charset detector to be closed. * @stable ICU 3.6 */ U_CAPI void U_EXPORT2 ucsdet_close(UCharsetDetector *ucsd); /** * Set the input byte data whose charset is to be detected. * * Ownership of the input text byte array remains with the caller. * The input string must not be altered or deleted until the charset * detector is either closed or reset to refer to different input text. * * @param ucsd the charset detector to be used. * @param textIn the input text of unknown encoding. * @param len the length of the input text, or -1 if the text * is NUL terminated. * @param status any error conditions are reported back in this variable. * * @stable ICU 3.6 */ U_CAPI void U_EXPORT2 ucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status); /** Set the declared encoding for charset detection. 
* The declared encoding of an input text is an encoding obtained * by the user from an http header or xml declaration or similar source that * can be provided as an additional hint to the charset detector. * * How and whether the declared encoding will be used during the * detection process is TBD. * * @param ucsd the charset detector to be used. * @param encoding an encoding for the current data obtained from * a header or declaration or other source outside * of the byte data itself. * @param length the length of the encoding name, or -1 if the name string * is NUL terminated. * @param status any error conditions are reported back in this variable. * * @stable ICU 3.6 */ U_CAPI void U_EXPORT2 ucsdet_setDeclaredEncoding(UCharsetDetector *ucsd, const char *encoding, int32_t length, UErrorCode *status); /** * Return the charset that best matches the supplied input data. * * Note though, that because the detection * only looks at the start of the input data, * there is a possibility that the returned charset will fail to handle * the full set of input data. *

* The returned UCharsetMatch object is owned by the UCharsetDetector. * It will remain valid until the detector input is reset, or until * the detector is closed. *

* The function will fail if *

 *   <ul>
 *     <li>no charset appears to match the data.</li>
 *     <li>no input text has been provided</li>
 *   </ul>
* * @param ucsd the charset detector to be used. * @param status any error conditions are reported back in this variable. * @return a UCharsetMatch representing the best matching charset, * or NULL if no charset matches the byte data. * * @stable ICU 3.6 */ U_CAPI const UCharsetMatch * U_EXPORT2 ucsdet_detect(UCharsetDetector *ucsd, UErrorCode *status); /** * Find all charset matches that appear to be consistent with the input, * returning an array of results. The results are ordered with the * best quality match first. * * Because the detection only looks at a limited amount of the * input byte data, some of the returned charsets may fail to handle * all of the input data. *

* The returned UCharsetMatch objects are owned by the UCharsetDetector. * They will remain valid until the detector is closed or modified * *

* Return an error if *

 *   <ul>
 *     <li>no charsets appear to match the input data.</li>
 *     <li>no input text has been provided</li>
 *   </ul>
* * @param ucsd the charset detector to be used. * @param matchesFound pointer to a variable that will be set to the * number of charsets identified that are consistent with * the input data. Output only. * @param status any error conditions are reported back in this variable. * @return A pointer to an array of pointers to UCharsetMatch objects. * This array, and the UCharsetMatch instances to which it refers, * are owned by the UCharsetDetector, and will remain valid until * the detector is closed or modified. * @stable ICU 3.6 */ U_CAPI const UCharsetMatch ** U_EXPORT2 ucsdet_detectAll(UCharsetDetector *ucsd, int32_t *matchesFound, UErrorCode *status); /** * Get the name of the charset represented by a UCharsetMatch. * * The storage for the returned name string is owned by the * UCharsetMatch, and will remain valid while the UCharsetMatch * is valid. * * The name returned is suitable for use with the ICU conversion APIs. * * @param ucsm The charset match object. * @param status Any error conditions are reported back in this variable. * @return The name of the matching charset. * * @stable ICU 3.6 */ U_CAPI const char * U_EXPORT2 ucsdet_getName(const UCharsetMatch *ucsm, UErrorCode *status); /** * Get a confidence number for the quality of the match of the byte * data with the charset. Confidence numbers range from zero to 100, * with 100 representing complete confidence and zero representing * no confidence. * * The confidence values are somewhat arbitrary. They define an * ordering within the results for any single detection operation * but are not generally comparable between the results for different input. * * A confidence value of ten does have a general meaning - it is used * for charsets that can represent the input data, but for which there * is no other indication that suggests that the charset is the correct one. 
* Pure 7 bit ASCII data, for example, is compatible with a * great many charsets, most of which will appear as possible matches * with a confidence of 10. * * @param ucsm The charset match object. * @param status Any error conditions are reported back in this variable. * @return A confidence number for the charset match. * * @stable ICU 3.6 */ U_CAPI int32_t U_EXPORT2 ucsdet_getConfidence(const UCharsetMatch *ucsm, UErrorCode *status); /** * Get the RFC 3066 code for the language of the input data. * * The Charset Detection service is intended primarily for detecting * charsets, not language. For some, but not all, charsets, a language is * identified as a byproduct of the detection process, and that is what * is returned by this function. * * CAUTION: * 1. Language information is not available for input data encoded in * all charsets. In particular, no language is identified * for UTF-8 input data. * * 2. Closely related languages may sometimes be confused. * * If more accurate language detection is required, a linguistic * analysis package should be used. * * The storage for the returned name string is owned by the * UCharsetMatch, and will remain valid while the UCharsetMatch * is valid. * * @param ucsm The charset match object. * @param status Any error conditions are reported back in this variable. * @return The RFC 3066 code for the language of the input data, or * an empty string if the language could not be determined. * * @stable ICU 3.6 */ U_CAPI const char * U_EXPORT2 ucsdet_getLanguage(const UCharsetMatch *ucsm, UErrorCode *status); /** * Get the entire input text as a UChar string, placing it into * a caller-supplied buffer. A terminating * NUL character will be appended to the buffer if space is available. * * The number of UChars in the output string, not including the terminating * NUL, is returned. * * If the supplied buffer is smaller than required to hold the output, * the contents of the buffer are undefined. 
The full output string length * (in UChars) is returned as always, and can be used to allocate a buffer * of the correct size. * * * @param ucsm The charset match object. * @param buf A UChar buffer to be filled with the converted text data. * @param cap The capacity of the buffer in UChars. * @param status Any error conditions are reported back in this variable. * @return The number of UChars in the output string. * * @stable ICU 3.6 */ U_CAPI int32_t U_EXPORT2 ucsdet_getUChars(const UCharsetMatch *ucsm, UChar *buf, int32_t cap, UErrorCode *status); /** * Get an iterator over the set of all detectable charsets - * over the charsets that are known to the charset detection * service. * * The returned UEnumeration provides access to the names of * the charsets. * *

* The state of the Charset detector that is passed in does not * affect the result of this function, but requiring a valid, open * charset detector as a parameter ensures that the charset detection * service has been safely initialized and that the required detection * data is available. * *

* Note: Multiple different charset encodings in the same family may use * a single shared name in this implementation. For example, this function returns * an enumeration including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252" * (Windows Latin 1). However, the actual detection result could be "windows-1252" * when the input data matches Latin 1 code points with any points only available * in "windows-1252". * * @param ucsd a Charset detector. * @param status Any error conditions are reported back in this variable. * @return an iterator providing access to the detectable charset names. * @stable ICU 3.6 */ U_CAPI UEnumeration * U_EXPORT2 ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status); /** * Test whether input filtering is enabled for this charset detector. * Input filtering removes text that appears to be HTML or xml * markup from the input before applying the code page detection * heuristics. * * @param ucsd The charset detector to check. * @return true if filtering is enabled. * @stable ICU 3.6 */ U_CAPI UBool U_EXPORT2 ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd); /** * Enable filtering of input text. If filtering is enabled, * text within angle brackets ("<" and ">") will be removed * before detection, which will remove most HTML or xml markup. * * @param ucsd the charset detector to be modified. * @param filter true to enable input text filtering. * @return The previous setting. * * @stable ICU 3.6 */ U_CAPI UBool U_EXPORT2 ucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter); #endif #endif /* __UCSDET_H */ // udisplayoptions.h // No supported content // ufieldpositer.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ***************************************************************************************** * Copyright (C) 2015-2016, International Business Machines * Corporation and others. All Rights Reserved. 
***************************************************************************************** */ #ifndef UFIELDPOSITER_H #define UFIELDPOSITER_H #if !UCONFIG_NO_FORMATTING /** * \file * \brief C API: UFieldPositionIterator for use with format APIs. * * Usage: * ufieldpositer_open creates an empty (unset) UFieldPositionIterator. * This can be passed to format functions such as {@link #udat_formatForFields}, * which will set it to apply to the fields in a particular formatted string. * ufieldpositer_next can then be used to iterate over those fields, * providing for each field its type (using values that are specific to the * particular format type, such as date or number formats), as well as the * start and end positions of the field in the formatted string. * A given UFieldPositionIterator can be re-used for different format calls; * each such call resets it to apply to that format string. * ufieldpositer_close should be called to dispose of the UFieldPositionIterator * when it is no longer needed. * * @see FieldPositionIterator */ /** * Opaque UFieldPositionIterator object for use in C. * @stable ICU 55 */ struct UFieldPositionIterator; typedef struct UFieldPositionIterator UFieldPositionIterator; /**< C typedef for struct UFieldPositionIterator. @stable ICU 55 */ /** * Open a new, unset UFieldPositionIterator object. * @param status * A pointer to a UErrorCode to receive any errors. * @return * A pointer to an empty (unset) UFieldPositionIterator object, * or NULL if an error occurred. * @stable ICU 55 */ U_CAPI UFieldPositionIterator* U_EXPORT2 ufieldpositer_open(UErrorCode* status); /** * Close a UFieldPositionIterator object. Once closed it may no longer be used. * @param fpositer * A pointer to the UFieldPositionIterator object to close. 
* @stable ICU 55 */ U_CAPI void U_EXPORT2 ufieldpositer_close(UFieldPositionIterator *fpositer); /** * Get information for the next field in the formatted string to which this * UFieldPositionIterator currently applies, or return a negative value if there * are no more fields. * @param fpositer * A pointer to the UFieldPositionIterator object containing iteration * state for the format fields. * @param beginIndex * A pointer to an int32_t to receive information about the start offset * of the field in the formatted string (undefined if the function * returns a negative value). May be NULL if this information is not needed. * @param endIndex * A pointer to an int32_t to receive information about the end offset * of the field in the formatted string (undefined if the function * returns a negative value). May be NULL if this information is not needed. * @return * The field type (non-negative value), or a negative value if there are * no more fields for which to provide information. If negative, then any * values pointed to by beginIndex and endIndex are undefined. * * The values for field type depend on what type of formatter the * UFieldPositionIterator has been set by; for a date formatter, the * values from the UDateFormatField enum. For more information, see the * descriptions of format functions that take a UFieldPositionIterator* * parameter, such as {@link #udat_formatForFields}. * * @stable ICU 55 */ U_CAPI int32_t U_EXPORT2 ufieldpositer_next(UFieldPositionIterator *fpositer, int32_t *beginIndex, int32_t *endIndex); #endif /* #if !UCONFIG_NO_FORMATTING */ #endif // uformattable.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************** * Copyright (C) 2013-2014, International Business Machines Corporation and others. * All Rights Reserved. 
******************************************************************************** * * File UFORMATTABLE.H * * Modification History: * * Date Name Description * 2013 Jun 7 srl New ******************************************************************************** */ /** * \file * \brief C API: UFormattable is a thin wrapper for primitive types used for formatting and parsing. * * This is a C interface to the icu::Formattable class. Static functions on this class convert * to and from this interface (via reinterpret_cast). Note that Formattables (and thus UFormattables) * are mutable, and many operations (even getters) may actually modify the internal state. For this * reason, UFormattables are not thread safe, and should not be shared between threads. * * See {@link unum_parseToUFormattable} for example code. */ #ifndef UFORMATTABLE_H #define UFORMATTABLE_H #if !UCONFIG_NO_FORMATTING /** * Enum designating the type of a UFormattable instance. * Practically, this indicates which of the getters would return without conversion * or error. * @see icu::Formattable::Type * @stable ICU 52 */ typedef enum UFormattableType { UFMT_DATE = 0, /**< ufmt_getDate() will return without conversion. @see ufmt_getDate*/ UFMT_DOUBLE, /**< ufmt_getDouble() will return without conversion. @see ufmt_getDouble*/ UFMT_LONG, /**< ufmt_getLong() will return without conversion. @see ufmt_getLong */ UFMT_STRING, /**< ufmt_getUChars() will return without conversion. @see ufmt_getUChars*/ UFMT_ARRAY, /**< ufmt_getArrayLength() and ufmt_getArrayItemByIndex() will return the value. @see ufmt_getArrayLength @see ufmt_getArrayItemByIndex */ UFMT_INT64, /**< ufmt_getInt64() will return without conversion. @see ufmt_getInt64 */ UFMT_OBJECT, /**< ufmt_getObject() will return without conversion. @see ufmt_getObject*/ } UFormattableType; /** * Opaque type representing various types of data which may be used for formatting * and parsing operations. 
* @see icu::Formattable * @stable ICU 52 */ typedef void *UFormattable; /** * Initialize a UFormattable, to type UFMT_LONG, value 0. * May return an error if memory allocation failed. * @param status error code. * See {@link unum_parseToUFormattable} for example code. * @stable ICU 52 * @return the new UFormattable * @see ufmt_close * @see icu::Formattable::Formattable() */ U_CAPI UFormattable* U_EXPORT2 ufmt_open(UErrorCode* status); /** * Cleanup any additional memory allocated by this UFormattable. * @param fmt the UFormattable to clean up * @stable ICU 52 * @see ufmt_open */ U_CAPI void U_EXPORT2 ufmt_close(UFormattable* fmt); /** * Return the type of this object * @param fmt the UFormattable object * @param status status code - U_ILLEGAL_ARGUMENT_ERROR is returned if the UFormattable contains data not supported by * the API * @return the value as a UFormattableType * @see ufmt_isNumeric * @see icu::Formattable::getType() const * @stable ICU 52 */ U_CAPI UFormattableType U_EXPORT2 ufmt_getType(const UFormattable* fmt, UErrorCode *status); /** * Return whether the object is numeric. * @param fmt the UFormattable object * @return true if the object is a double, long, or int64 value, else false. * @see ufmt_getType * @see icu::Formattable::isNumeric() const * @stable ICU 52 */ U_CAPI UBool U_EXPORT2 ufmt_isNumeric(const UFormattable* fmt); /** * Gets the UDate value of this object. If the type is not of type UFMT_DATE, * status is set to U_INVALID_FORMAT_ERROR and the return value is * undefined. * @param fmt the UFormattable object * @param status the error code - any conversion or format errors * @return the value * @stable ICU 52 * @see icu::Formattable::getDate(UErrorCode&) const */ U_CAPI UDate U_EXPORT2 ufmt_getDate(const UFormattable* fmt, UErrorCode *status); /** * Gets the double value of this object. If the type is not a UFMT_DOUBLE, or * if there are additional significant digits than fit in a double type, * a conversion is performed with possible loss of precision. 
* If the type is UFMT_OBJECT and the * object is a Measure, then the result of * getNumber().getDouble(status) is returned. If this object is * neither a numeric type nor a Measure, then 0 is returned and * the status is set to U_INVALID_FORMAT_ERROR. * @param fmt the UFormattable object * @param status the error code - any conversion or format errors * @return the value * @stable ICU 52 * @see icu::Formattable::getDouble(UErrorCode&) const */ U_CAPI double U_EXPORT2 ufmt_getDouble(UFormattable* fmt, UErrorCode *status); /** * Gets the long (int32_t) value of this object. If the magnitude is too * large to fit in a long, then the maximum or minimum long value, * as appropriate, is returned and the status is set to * U_INVALID_FORMAT_ERROR. If this object is of type UFMT_INT64 and * it fits within a long, then no precision is lost. If it is of * type kDouble or kDecimalNumber, then a conversion is performed, with * truncation of any fractional part. If the type is UFMT_OBJECT and * the object is a Measure, then the result of * getNumber().getLong(status) is returned. If this object is * neither a numeric type nor a Measure, then 0 is returned and * the status is set to U_INVALID_FORMAT_ERROR. * @param fmt the UFormattable object * @param status the error code - any conversion or format errors * @return the value * @stable ICU 52 * @see icu::Formattable::getLong(UErrorCode&) const */ U_CAPI int32_t U_EXPORT2 ufmt_getLong(UFormattable* fmt, UErrorCode *status); /** * Gets the int64_t value of this object. If this object is of a numeric * type and the magnitude is too large to fit in an int64, then * the maximum or minimum int64 value, as appropriate, is returned * and the status is set to U_INVALID_FORMAT_ERROR. If the * magnitude fits in an int64, then a casting conversion is * performed, with truncation of any fractional part. If the type * is UFMT_OBJECT and the object is a Measure, then the result of * getNumber().getDouble(status) is returned. 
If this object is * neither a numeric type nor a Measure, then 0 is returned and * the status is set to U_INVALID_FORMAT_ERROR. * @param fmt the UFormattable object * @param status the error code - any conversion or format errors * @return the value * @stable ICU 52 * @see icu::Formattable::getInt64(UErrorCode&) const */ U_CAPI int64_t U_EXPORT2 ufmt_getInt64(UFormattable* fmt, UErrorCode *status); /** * Returns a pointer to the UObject contained within this * formattable (as a const void*), or NULL if this object * is not of type UFMT_OBJECT. * @param fmt the UFormattable object * @param status the error code - any conversion or format errors * @return the value as a const void*. It is a polymorphic C++ object. * @stable ICU 52 * @see icu::Formattable::getObject() const */ U_CAPI const void *U_EXPORT2 ufmt_getObject(const UFormattable* fmt, UErrorCode *status); /** * Gets the string value of this object as a UChar string. If the type is not a * string, status is set to U_INVALID_FORMAT_ERROR and a NULL pointer is returned. * This function is not thread safe and may modify the UFormattable if need be to terminate the string. * The returned pointer is not valid if any other functions are called on this UFormattable, or if the UFormattable is closed. * @param fmt the UFormattable object * @param status the error code - any conversion or format errors * @param len if non null, contains the string length on return * @return the null terminated string value - must not be referenced after any other functions are called on this UFormattable. * @stable ICU 52 * @see icu::Formattable::getString(UnicodeString&)const */ U_CAPI const UChar* U_EXPORT2 ufmt_getUChars(UFormattable* fmt, int32_t *len, UErrorCode *status); /** * Get the number of array objects contained, if an array type UFMT_ARRAY * @param fmt the UFormattable object * @param status the error code - any conversion or format errors. U_ILLEGAL_ARGUMENT_ERROR if not an array type. 
* @return the number of array objects or undefined if not an array type * @stable ICU 52 * @see ufmt_getArrayItemByIndex */ U_CAPI int32_t U_EXPORT2 ufmt_getArrayLength(const UFormattable* fmt, UErrorCode *status); /** * Get the specified value from the array of UFormattables. Invalid if the object is not an array type UFMT_ARRAY * @param fmt the UFormattable object * @param n the index of the array item to return (0 based). * @param status the error code - any conversion or format errors. Returns an error if n is out of bounds. * @return the nth array value, only valid while the containing UFormattable is valid. NULL if not an array. * @stable ICU 52 * @see icu::Formattable::getArray(int32_t&, UErrorCode&) const */ U_CAPI UFormattable * U_EXPORT2 ufmt_getArrayItemByIndex(UFormattable* fmt, int32_t n, UErrorCode *status); /** * Returns a numeric string representation of the number contained within this * formattable, or NULL if this object does not contain a numeric type. * For values obtained by parsing, the returned decimal number retains * the full precision and range of the original input, unconstrained by * the limits of a double floating point or a 64 bit int. * * This function is not thread safe, and therefore is not declared const, * even though it is logically const. * The resulting buffer is owned by the UFormattable and is invalid if any other functions are * called on the UFormattable. * * Possible errors include U_MEMORY_ALLOCATION_ERROR, and * U_INVALID_STATE_ERROR if the formattable object has not been set to * a numeric type. * @param fmt the UFormattable object * @param len if non-null, on exit contains the string length (not including the terminating null) * @param status the error code * @return the character buffer as a NULL terminated string, which is owned by the object and must not be accessed if any other functions are called on this object.
* @stable ICU 52 * @see icu::Formattable::getDecimalNumber(UErrorCode&) */ U_CAPI const char * U_EXPORT2 ufmt_getDecNumChars(UFormattable *fmt, int32_t *len, UErrorCode *status); #endif #endif #if (NTDDI_VERSION >= NTDDI_WIN10_CO) // uformattedvalue.h // Copyright (C) 2018 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html #ifndef __UFORMATTEDVALUE_H__ #define __UFORMATTEDVALUE_H__ #if !UCONFIG_NO_FORMATTING /** * \file * \brief C API: Abstract operations for localized strings. * * This file contains declarations for classes that deal with formatted strings. A number * of APIs throughout ICU use these classes for expressing their localized output. */ /** * All possible field categories in ICU. Every entry in this enum corresponds * to another enum that exists in ICU. * * In the APIs that take a UFieldCategory, an int32_t type is used. Field * categories having any of the top four bits turned on are reserved as * private-use for external APIs implementing FormattedValue. This means that * categories 2^28 and higher or below zero (with the highest bit turned on) * are private-use and will not be used by ICU in the future. * * @stable ICU 64 */ typedef enum UFieldCategory { /** * For an undefined field category. * * @stable ICU 64 */ UFIELD_CATEGORY_UNDEFINED = 0, /** * For fields in UDateFormatField (udat.h), from ICU 3.0. * Implicit value: 1. * * @stable ICU 64 */ UFIELD_CATEGORY_DATE, /** * For fields in UNumberFormatFields (unum.h), from ICU 49. * Implicit value: 2. * * @stable ICU 64 */ UFIELD_CATEGORY_NUMBER, /** * For fields in UListFormatterField (ulistformatter.h), from ICU 63. * Implicit value: 3. * * @stable ICU 64 */ UFIELD_CATEGORY_LIST, /** * For fields in URelativeDateTimeFormatterField (ureldatefmt.h), from ICU 64. * Implicit value: 4. * * @stable ICU 64 */ UFIELD_CATEGORY_RELATIVE_DATETIME, /** * Reserved for possible future fields in UDateIntervalFormatField. * Implicit value: 5. * * @internal */ UFIELD_CATEGORY_DATE_INTERVAL, /** * Category for spans in a list.
* * @stable ICU 64 */ UFIELD_CATEGORY_LIST_SPAN = 0x1000 + UFIELD_CATEGORY_LIST, /** * Category for spans in a date interval. * * @stable ICU 64 */ UFIELD_CATEGORY_DATE_INTERVAL_SPAN = 0x1000 + UFIELD_CATEGORY_DATE_INTERVAL, #if (NTDDI_VERSION >= NTDDI_WIN11_ZN) /** * Category for spans in a number range. * * @stable ICU 69 */ UFIELD_CATEGORY_NUMBER_RANGE_SPAN = 0x1000 + UFIELD_CATEGORY_NUMBER, #endif // (NTDDI_VERSION >= NTDDI_WIN11_ZN) } UFieldCategory; struct UConstrainedFieldPosition; /** * Represents a span of a string containing a given field. * * This struct differs from UFieldPosition in the following ways: * * 1. It has information on the field category. * 2. It allows you to set constraints to use when iterating over field positions. * 3. It is used for the newer FormattedValue APIs. * * @stable ICU 64 */ typedef struct UConstrainedFieldPosition UConstrainedFieldPosition; /** * Creates a new UConstrainedFieldPosition. * * By default, the UConstrainedFieldPosition has no iteration constraints. * * @param ec Set if an error occurs. * @return The new object, or NULL if an error occurs. The caller owns it * and must release it with ucfpos_close(). * @stable ICU 64 */ U_CAPI UConstrainedFieldPosition* U_EXPORT2 ucfpos_open(UErrorCode* ec); /** * Resets a UConstrainedFieldPosition to its initial state, as if it were newly created. * * Removes any constraints that may have been set on the instance. * * @param ucfpos The instance of UConstrainedFieldPosition. * @param ec Set if an error occurs. * @stable ICU 64 */ U_CAPI void U_EXPORT2 ucfpos_reset( UConstrainedFieldPosition* ucfpos, UErrorCode* ec); /** * Destroys a UConstrainedFieldPosition and releases its memory. * * @param ucfpos The instance of UConstrainedFieldPosition. * @stable ICU 64 */ U_CAPI void U_EXPORT2 ucfpos_close(UConstrainedFieldPosition* ucfpos); /** * Sets a constraint on the field category. * * When this instance of UConstrainedFieldPosition is passed to ufmtval_nextPosition, * positions are skipped unless they have the given category.
* * Any previously set constraints are cleared. * * For example, to loop over only the number-related fields: * * UConstrainedFieldPosition* ucfpos = ucfpos_open(ec); * ucfpos_constrainCategory(ucfpos, UFIELD_CATEGORY_NUMBER, ec); * while (ufmtval_nextPosition(ufmtval, ucfpos, ec)) { * // handle the number-related field position * } * ucfpos_close(ucfpos); * * Changing the constraint while in the middle of iterating over a FormattedValue * does not generally have well-defined behavior. * * @param ucfpos The instance of UConstrainedFieldPosition. * @param category The field category to fix when iterating. * @param ec Set if an error occurs. * @stable ICU 64 */ U_CAPI void U_EXPORT2 ucfpos_constrainCategory( UConstrainedFieldPosition* ucfpos, int32_t category, UErrorCode* ec); /** * Sets a constraint on the category and field. * * When this instance of UConstrainedFieldPosition is passed to ufmtval_nextPosition, * positions are skipped unless they have the given category and field. * * Any previously set constraints are cleared. * * For example, to loop over all grouping separators: * * UConstrainedFieldPosition* ucfpos = ucfpos_open(ec); * ucfpos_constrainField(ucfpos, UFIELD_CATEGORY_NUMBER, UNUM_GROUPING_SEPARATOR_FIELD, ec); * while (ufmtval_nextPosition(ufmtval, ucfpos, ec)) { * // handle the grouping separator position * } * ucfpos_close(ucfpos); * * Changing the constraint while in the middle of iterating over a FormattedValue * does not generally have well-defined behavior. * * @param ucfpos The instance of UConstrainedFieldPosition. * @param category The field category to fix when iterating. * @param field The field to fix when iterating. * @param ec Set if an error occurs. * @stable ICU 64 */ U_CAPI void U_EXPORT2 ucfpos_constrainField( UConstrainedFieldPosition* ucfpos, int32_t category, int32_t field, UErrorCode* ec); /** * Gets the field category for the current position.
* * If a category or field constraint was set, this function returns the constrained * category. Otherwise, the return value is well-defined only after * ufmtval_nextPosition returns true. * * @param ucfpos The instance of UConstrainedFieldPosition. * @param ec Set if an error occurs. * @return The field category saved in the instance. * @stable ICU 64 */ U_CAPI int32_t U_EXPORT2 ucfpos_getCategory( const UConstrainedFieldPosition* ucfpos, UErrorCode* ec); /** * Gets the field for the current position. * * If a field constraint was set, this function returns the constrained * field. Otherwise, the return value is well-defined only after * ufmtval_nextPosition returns true. * * @param ucfpos The instance of UConstrainedFieldPosition. * @param ec Set if an error occurs. * @return The field saved in the instance. * @stable ICU 64 */ U_CAPI int32_t U_EXPORT2 ucfpos_getField( const UConstrainedFieldPosition* ucfpos, UErrorCode* ec); /** * Gets the INCLUSIVE start and EXCLUSIVE end index stored for the current position. * * The output values are well-defined only after ufmtval_nextPosition returns true. * * @param ucfpos The instance of UConstrainedFieldPosition. * @param pStart Set to the inclusive start index saved in the instance. Ignored if NULL. * @param pLimit Set to the exclusive end index saved in the instance. Ignored if NULL. * @param ec Set if an error occurs. * @stable ICU 64 */ U_CAPI void U_EXPORT2 ucfpos_getIndexes( const UConstrainedFieldPosition* ucfpos, int32_t* pStart, int32_t* pLimit, UErrorCode* ec); /** * Gets an int64 that FormattedValue implementations may use for storage. * * The initial value is zero. * * Users of FormattedValue should not need to call this method. * * @param ucfpos The instance of UConstrainedFieldPosition. * @param ec Set if an error occurs. * @return The current iteration context from ucfpos_setInt64IterationContext.
* @stable ICU 64 */ U_CAPI int64_t U_EXPORT2 ucfpos_getInt64IterationContext( const UConstrainedFieldPosition* ucfpos, UErrorCode* ec); /** * Sets an int64 that FormattedValue implementations may use for storage. * * Intended to be used by FormattedValue implementations. * * @param ucfpos The instance of UConstrainedFieldPosition. * @param context The new iteration context. * @param ec Set if an error occurs. * @stable ICU 64 */ U_CAPI void U_EXPORT2 ucfpos_setInt64IterationContext( UConstrainedFieldPosition* ucfpos, int64_t context, UErrorCode* ec); /** * Determines whether a given field should be included given the * constraints. * * Intended to be used by FormattedValue implementations. * * @param ucfpos The instance of UConstrainedFieldPosition. * @param category The category to test. * @param field The field to test. * @param ec Set if an error occurs. * @return Whether the field should be included given the constraints. * @stable ICU 64 */ U_CAPI UBool U_EXPORT2 ucfpos_matchesField( const UConstrainedFieldPosition* ucfpos, int32_t category, int32_t field, UErrorCode* ec); /** * Sets new values for the primary public getters. * * Intended to be used by FormattedValue implementations. * * It is up to the implementation to ensure that the user-requested * constraints are satisfied. This method does not check! * * @param ucfpos The instance of UConstrainedFieldPosition. * @param category The new field category. * @param field The new field. * @param start The new inclusive start index. * @param limit The new exclusive end index. * @param ec Set if an error occurs. * @stable ICU 64 */ U_CAPI void U_EXPORT2 ucfpos_setState( UConstrainedFieldPosition* ucfpos, int32_t category, int32_t field, int32_t start, int32_t limit, UErrorCode* ec); struct UFormattedValue; /** * An abstract formatted value: a string with associated field attributes. * Many formatters format to types compatible with UFormattedValue. * * @stable ICU 64 */ typedef struct UFormattedValue UFormattedValue; /** * Returns a pointer to the formatted string.
The pointer is owned by the UFormattedValue. The * return value is valid only as long as the UFormattedValue is present and unchanged in memory. * * The return value is NUL-terminated but could contain internal NULs. * * @param ufmtval * The object containing the formatted string and attributes. * @param pLength Output variable for the length of the string. Ignored if NULL. * @param ec Set if an error occurs. * @return A NUL-terminated char16 string owned by the UFormattedValue. * @stable ICU 64 */ U_CAPI const UChar* U_EXPORT2 ufmtval_getString( const UFormattedValue* ufmtval, int32_t* pLength, UErrorCode* ec); /** * Iterates over field positions in the UFormattedValue. This lets you determine the position * of specific types of substrings, like a month or a decimal separator. * * To loop over all field positions: * * UConstrainedFieldPosition* ucfpos = ucfpos_open(ec); * while (ufmtval_nextPosition(ufmtval, ucfpos, ec)) { * // handle the field position; get information from ucfpos * } * ucfpos_close(ucfpos); * * @param ufmtval * The object containing the formatted string and attributes. * @param ucfpos * The object used for iteration state; can provide constraints to iterate over only * one specific category or field; * see ucfpos_constrainCategory * and ucfpos_constrainField. * @param ec Set if an error occurs. * @return true if another position was found; false otherwise. * When true, read the found position from ucfpos with * ucfpos_getCategory, ucfpos_getField and ucfpos_getIndexes. * @stable ICU 64 */ U_CAPI UBool U_EXPORT2 ufmtval_nextPosition( const UFormattedValue* ufmtval, UConstrainedFieldPosition* ucfpos, UErrorCode* ec); #endif /* #if !UCONFIG_NO_FORMATTING */ #endif // __UFORMATTEDVALUE_H__ #endif // (NTDDI_VERSION >= NTDDI_WIN10_CO) // udateintervalformat.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ***************************************************************************************** * Copyright (C) 2010-2012,2015 International Business Machines * Corporation and others.
All Rights Reserved. ***************************************************************************************** */ #ifndef UDATEINTERVALFORMAT_H #define UDATEINTERVALFORMAT_H #if !UCONFIG_NO_FORMATTING /** * \file * \brief C API: Format a date interval. * * A UDateIntervalFormat is used to format the range between two UDate values * in a locale-sensitive way, using a skeleton that specifies the precision and * completeness of the information to show. If the range is smaller than the resolution * specified by the skeleton, a single date format will be produced. If the range * is larger than the format specified by the skeleton, a locale-specific fallback * will be used to format the items missing from the skeleton. * * For example, if the range is 2010-03-04 07:56 - 2010-03-04 19:56 (12 hours) * - The skeleton jm will produce * for en_US, "7:56 AM - 7:56 PM" * for en_GB, "7:56 - 19:56" * - The skeleton MMMd will produce * for en_US, "Mar 4" * for en_GB, "4 Mar" * If the range is 2010-03-04 07:56 - 2010-03-08 16:11 (4 days, 8 hours, 15 minutes) * - The skeleton jm will produce * for en_US, "3/4/2010 7:56 AM - 3/8/2010 4:11 PM" * for en_GB, "4/3/2010 7:56 - 8/3/2010 16:11" * - The skeleton MMMd will produce * for en_US, "Mar 4-8" * for en_GB, "4-8 Mar" * * Note: the "-" characters in the above sample output will actually be * U+2013 EN DASH, in all but the last example. * * Note, in ICU 4.4 the standard skeletons for which date interval format data * is usually available are as follows; best results will be obtained by using * skeletons from this set, or those formed by combining these standard skeletons * (note that for these skeletons, the length of digit field such as d, y, or * M vs MM is irrelevant (but for non-digit fields such as MMM vs MMMM it is * relevant)). Note that a skeleton involving h or H generally explicitly requests * that time style (12- or 24-hour time respectively).
For a skeleton that * requests the locale's default time style (h or H), use 'j' instead of h or H. * h, H, hm, Hm, * hv, Hv, hmv, Hmv, * d, * M, MMM, MMMM, * Md, MMMd, * MEd, MMMEd, * y, * yM, yMMM, yMMMM, * yMd, yMMMd, * yMEd, yMMMEd * * Locales for which ICU 4.4 seems to have a reasonable amount of this data * include: * af, am, ar, be, bg, bn, ca, cs, da, de (_AT), el, en (_AU,_CA,_GB,_IE,_IN...), * eo, es (_AR,_CL,_CO,...,_US), et, fa, fi, fo, fr (_BE,_CH,_CA), fur, gsw, he, * hr, hu, hy, is, it (_CH), ja, kk, km, ko, lt, lv, mk, ml, mt, nb, nl (_BE), * nn, pl, pt (_PT), rm, ro, ru (_UA), sk, sl, so, sq, sr, sr_Latn, sv, th, to, * tr, uk, ur, vi, zh (_SG), zh_Hant (_HK,_MO) */ /** * Opaque UDateIntervalFormat object for use in C programs. * @stable ICU 4.8 */ struct UDateIntervalFormat; typedef struct UDateIntervalFormat UDateIntervalFormat; /**< C typedef for struct UDateIntervalFormat. @stable ICU 4.8 */ #if (NTDDI_VERSION >= NTDDI_WIN10_CO) struct UFormattedDateInterval; /** * Opaque struct to contain the results of a UDateIntervalFormat operation. * @stable ICU 64 */ typedef struct UFormattedDateInterval UFormattedDateInterval; #endif // (NTDDI_VERSION >= NTDDI_WIN10_CO) /** * Open a new UDateIntervalFormat object using the predefined rules for a * given locale plus a specified skeleton. * @param locale * The locale whose rules should be used; may be NULL for * default locale. * @param skeleton * A pattern containing only the fields desired for the interval * format, for example "Hm", "yMMMd", or "yMMMEdHm". * @param skeletonLength * The length of skeleton; may be -1 if the skeleton is zero-terminated. * @param tzID * A timezone ID specifying the timezone to use. If 0, use the default * timezone. * @param tzIDLength * The length of tzID, or -1 if null-terminated. If 0, use the default * timezone. * @param status * A pointer to a UErrorCode to receive any errors.
* @return * A pointer to a UDateIntervalFormat object for the specified locale, * or NULL if an error occurred. * @stable ICU 4.8 */ U_CAPI UDateIntervalFormat* U_EXPORT2 udtitvfmt_open(const char* locale, const UChar* skeleton, int32_t skeletonLength, const UChar* tzID, int32_t tzIDLength, UErrorCode* status); /** * Close a UDateIntervalFormat object. Once closed it may no longer be used. * @param formatter * The UDateIntervalFormat object to close. * @stable ICU 4.8 */ U_CAPI void U_EXPORT2 udtitvfmt_close(UDateIntervalFormat *formatter); #if (NTDDI_VERSION >= NTDDI_WIN10_CO) /** * Creates an object to hold the result of a UDateIntervalFormat * operation. The object can be used repeatedly; it is cleared whenever * passed to a format function. * * @param ec Set if an error occurs. * @return A pointer needing ownership; release it with udtitvfmt_closeResult(). * @stable ICU 64 */ U_CAPI UFormattedDateInterval* U_EXPORT2 udtitvfmt_openResult(UErrorCode* ec); /** * Returns a representation of a UFormattedDateInterval as a UFormattedValue, * which can be subsequently passed to any API requiring that type. * * The returned object is owned by the UFormattedDateInterval and is valid * only as long as the UFormattedDateInterval is present and unchanged in memory. * * You can think of this method as a cast between types. * * When calling ufmtval_nextPosition(): * The fields are returned from left to right. The special field category * UFIELD_CATEGORY_DATE_INTERVAL_SPAN is used to indicate which datetime * primitives came from which arguments: 0 means fromCalendar, and 1 means * toCalendar. The span category will always occur before the * corresponding fields in UFIELD_CATEGORY_DATE * in the ufmtval_nextPosition() iterator. * * @param uresult The object containing the formatted string. * @param ec Set if an error occurs. * @return A UFormattedValue owned by the input object.
* @stable ICU 64 */ U_CAPI const UFormattedValue* U_EXPORT2 udtitvfmt_resultAsValue(const UFormattedDateInterval* uresult, UErrorCode* ec); /** * Releases the UFormattedDateInterval created by udtitvfmt_openResult(). * * @param uresult The object to release. * @stable ICU 64 */ U_CAPI void U_EXPORT2 udtitvfmt_closeResult(UFormattedDateInterval* uresult); #endif // (NTDDI_VERSION >= NTDDI_WIN10_CO) /** * Formats a date/time range using the conventions established for the * UDateIntervalFormat object. * @param formatter * The UDateIntervalFormat object specifying the format conventions. * @param fromDate * The starting point of the range. * @param toDate * The ending point of the range. * @param result * A pointer to a buffer to receive the formatted range. * @param resultCapacity * The maximum size of result. * @param position * A pointer to a UFieldPosition. On input, position->field is read. * On output, position->beginIndex and position->endIndex indicate * the beginning and ending indices of field number position->field, * if such a field exists. This parameter may be NULL, in which case * no field position data is returned. * There may be multiple instances of a given field type in an * interval format; in this case the position indices refer to the * first instance. * @param status * A pointer to a UErrorCode to receive any errors. * @return * The total buffer size needed; if greater than resultCapacity, the * output was truncated. * @stable ICU 4.8 */ U_CAPI int32_t U_EXPORT2 udtitvfmt_format(const UDateIntervalFormat* formatter, UDate fromDate, UDate toDate, UChar* result, int32_t resultCapacity, UFieldPosition* position, UErrorCode* status); #if (NTDDI_VERSION >= NTDDI_WIN11_ZN) /** * Formats a date/time range using the conventions established for the * UDateIntervalFormat object. * @param formatter * The UDateIntervalFormat object specifying the format conventions. * @param fromDate * The starting point of the range.
* @param toDate * The ending point of the range. * @param result * The UFormattedDateInterval to contain the result of the * formatting operation. * @param status * A pointer to a UErrorCode to receive any errors. * @stable ICU 67 */ U_CAPI void U_EXPORT2 udtitvfmt_formatToResult( const UDateIntervalFormat* formatter, UDate fromDate, UDate toDate, UFormattedDateInterval* result, UErrorCode* status); /** * Formats a date/time range using the conventions established for the * UDateIntervalFormat object. * @param formatter * The UDateIntervalFormat object specifying the format conventions. * @param fromCalendar * The starting point of the range. * @param toCalendar * The ending point of the range. * @param result * The UFormattedDateInterval to contain the result of the * formatting operation. * @param status * A pointer to a UErrorCode to receive any errors. * @stable ICU 67 */ U_CAPI void U_EXPORT2 udtitvfmt_formatCalendarToResult( const UDateIntervalFormat* formatter, UCalendar* fromCalendar, UCalendar* toCalendar, UFormattedDateInterval* result, UErrorCode* status); /** * Set a particular UDisplayContext value in the formatter, such as * UDISPCTX_CAPITALIZATION_FOR_STANDALONE. This causes the formatted * result to be capitalized appropriately for the context in which * it is intended to be used, considering both the locale and the * type of field at the beginning of the formatted result. * @param formatter The formatter for which to set a UDisplayContext value. * @param value The UDisplayContext value to set. * @param status A pointer to a UErrorCode to receive any errors * @stable ICU 68 */ U_CAPI void U_EXPORT2 udtitvfmt_setContext(UDateIntervalFormat* formatter, UDisplayContext value, UErrorCode* status); /** * Get the formatter's UDisplayContext value for the specified UDisplayContextType, * such as UDISPCTX_TYPE_CAPITALIZATION. * @param formatter The formatter to query.
* @param type The UDisplayContextType whose value to return * @param status A pointer to an UErrorCode to receive any errors * @return The UDisplayContextValue for the specified type. * @stable ICU 68 */ U_CAPI UDisplayContext U_EXPORT2 udtitvfmt_getContext(const UDateIntervalFormat* formatter, UDisplayContextType type, UErrorCode* status); #endif // (NTDDI_VERSION >= NTDDI_WIN11_ZN) #endif /* #if !UCONFIG_NO_FORMATTING */ #endif // ugender.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ***************************************************************************************** * Copyright (C) 2010-2013, International Business Machines * Corporation and others. All Rights Reserved. ***************************************************************************************** */ #ifndef UGENDER_H #define UGENDER_H #if !UCONFIG_NO_FORMATTING /** * \file * \brief C API: The purpose of this API is to compute the gender of a list as a * whole given the gender of each element. * */ /** * Genders * @stable ICU 50 */ enum UGender { /** * Male gender. * @stable ICU 50 */ UGENDER_MALE, /** * Female gender. * @stable ICU 50 */ UGENDER_FEMALE, /** * Neutral gender. * @stable ICU 50 */ UGENDER_OTHER }; /** * C typedef for enum UGender. * @stable ICU 50 */ typedef enum UGender UGender; struct UGenderInfo; /** * Opaque UGenderInfo object for use in C programs. * @stable ICU 50 */ typedef struct UGenderInfo UGenderInfo; /** * Opens a new UGenderInfo object given locale. * @param locale The locale for which the rules are desired. * @param status UErrorCode pointer * @return A UGenderInfo for the specified locale, or NULL if an error occurred. * @stable ICU 50 */ U_CAPI const UGenderInfo* U_EXPORT2 ugender_getInstance(const char *locale, UErrorCode *status); /** * Given a list, returns the gender of the list as a whole. * @param genderInfo pointer that ugender_getInstance returns. * @param genders the gender of each element in the list.
* @param size the size of the list. * @param status A pointer to a UErrorCode to receive any errors. * @return The gender of the list. * @stable ICU 50 */ U_CAPI UGender U_EXPORT2 ugender_getListGender(const UGenderInfo* genderInfo, const UGender *genders, int32_t size, UErrorCode *status); #endif /* #if !UCONFIG_NO_FORMATTING */ #endif // ulistformatter.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ***************************************************************************************** * Copyright (C) 2015-2016, International Business Machines * Corporation and others. All Rights Reserved. ***************************************************************************************** */ #ifndef ULISTFORMATTER_H #define ULISTFORMATTER_H #if !UCONFIG_NO_FORMATTING /** * \file * \brief C API: Format a list in a locale-appropriate way. * * A UListFormatter is used to format a list of items in a locale-appropriate way, * using data from CLDR. * Example: Input data ["Alice", "Bob", "Charlie", "Delta"] will be formatted * as "Alice, Bob, Charlie, and Delta" in English. */ /** * Opaque UListFormatter object for use in C programs. * @stable ICU 55 */ struct UListFormatter; typedef struct UListFormatter UListFormatter; /**< C typedef for struct UListFormatter. @stable ICU 55 */ #if (NTDDI_VERSION >= NTDDI_WIN10_CO) struct UFormattedList; /** * Opaque struct to contain the results of a UListFormatter operation. * @stable ICU 64 */ typedef struct UFormattedList UFormattedList; /** * FieldPosition and UFieldPosition selectors for format fields * defined by ListFormatter. * @stable ICU 63 */ typedef enum UListFormatterField { /** * The literal text in the result which came from the resources. * @stable ICU 63 */ ULISTFMT_LITERAL_FIELD, /** * The element text in the result which came from the input strings.
* @stable ICU 63 */ ULISTFMT_ELEMENT_FIELD } UListFormatterField; /** * Type of meaning expressed by the list. * * @stable ICU 67 */ typedef enum UListFormatterType { /** * Conjunction formatting, e.g. "Alice, Bob, Charlie, and Delta". * * @stable ICU 67 */ ULISTFMT_TYPE_AND, /** * Disjunction (or alternative, or simply one of) formatting, e.g. * "Alice, Bob, Charlie, or Delta". * * @stable ICU 67 */ ULISTFMT_TYPE_OR, /** * Formatting of a list of values with units, e.g. "5 pounds, 12 ounces". * * @stable ICU 67 */ ULISTFMT_TYPE_UNITS } UListFormatterType; /** * Verbosity level of the list patterns. * * @stable ICU 67 */ typedef enum UListFormatterWidth { /** * Use list formatting with full words (no abbreviations) when possible. * * @stable ICU 67 */ ULISTFMT_WIDTH_WIDE, /** * Use list formatting of typical length. * @stable ICU 67 */ ULISTFMT_WIDTH_SHORT, /** * Use list formatting of the shortest possible length. * @stable ICU 67 */ ULISTFMT_WIDTH_NARROW, } UListFormatterWidth; #endif // (NTDDI_VERSION >= NTDDI_WIN10_CO) /** * Open a new UListFormatter object using the rules for a given locale. * The object will be initialized with AND type (ULISTFMT_TYPE_AND) and * WIDE width (ULISTFMT_WIDTH_WIDE). * * @param locale * The locale whose rules should be used; may be NULL for * default locale. * @param status * A pointer to a standard ICU UErrorCode (input/output parameter). * Its input value must pass the U_SUCCESS() test, or else the * function returns immediately. The caller should check its output * value with U_FAILURE(), or use with function chaining (see User * Guide for details). * @return * A pointer to a UListFormatter object for the specified locale, * or NULL if an error occurred. * @stable ICU 55 */ U_CAPI UListFormatter* U_EXPORT2 ulistfmt_open(const char* locale, UErrorCode* status); #if (NTDDI_VERSION >= NTDDI_WIN10_CO) /** * Open a new UListFormatter object appropriate for the given locale, list type, * and width.
* * @param locale * The locale whose rules should be used; may be NULL for * default locale. * @param type * The type of list formatting to use. * @param width * The width of formatting to use. * @param status * A pointer to a standard ICU UErrorCode (input/output parameter). * Its input value must pass the U_SUCCESS() test, or else the * function returns immediately. The caller should check its output * value with U_FAILURE(), or use with function chaining (see User * Guide for details). * @return * A pointer to a UListFormatter object for the specified locale, * or NULL if an error occurred. * @stable ICU 67 */ U_CAPI UListFormatter* U_EXPORT2 ulistfmt_openForType(const char* locale, UListFormatterType type, UListFormatterWidth width, UErrorCode* status); #endif // (NTDDI_VERSION >= NTDDI_WIN10_CO) /** * Close a UListFormatter object. Once closed it may no longer be used. * @param listfmt * The UListFormatter object to close. * @stable ICU 55 */ U_CAPI void U_EXPORT2 ulistfmt_close(UListFormatter *listfmt); #if (NTDDI_VERSION >= NTDDI_WIN10_CO) /** * Creates an object to hold the result of a UListFormatter * operation. The object can be used repeatedly; it is cleared whenever * passed to a format function. * * @param ec Set if an error occurs. * @return A pointer needing ownership; release it with ulistfmt_closeResult(). * @stable ICU 64 */ U_CAPI UFormattedList* U_EXPORT2 ulistfmt_openResult(UErrorCode* ec); /** * Returns a representation of a UFormattedList as a UFormattedValue, * which can be subsequently passed to any API requiring that type. * * The returned object is owned by the UFormattedList and is valid * only as long as the UFormattedList is present and unchanged in memory. * * You can think of this method as a cast between types. * * When calling ufmtval_nextPosition(): * The fields are returned from start to end. The special field category * UFIELD_CATEGORY_LIST_SPAN is used to indicate which argument * was inserted at the given position.
The span category will * always occur before the corresponding instance of UFIELD_CATEGORY_LIST * in the ufmtval_nextPosition() iterator. * * @param uresult The object containing the formatted string. * @param ec Set if an error occurs. * @return A UFormattedValue owned by the input object. * @stable ICU 64 */ U_CAPI const UFormattedValue* U_EXPORT2 ulistfmt_resultAsValue(const UFormattedList* uresult, UErrorCode* ec); /** * Releases the UFormattedList created by ulistfmt_openResult(). * * @param uresult The object to release. * @stable ICU 64 */ U_CAPI void U_EXPORT2 ulistfmt_closeResult(UFormattedList* uresult); #endif // (NTDDI_VERSION >= NTDDI_WIN10_CO) /** * Formats a list of strings using the conventions established for the * UListFormatter object. * @param listfmt * The UListFormatter object specifying the list conventions. * @param strings * An array of pointers to UChar strings; the array length is * specified by stringCount. Must be non-NULL if stringCount > 0. * @param stringLengths * An array of string lengths corresponding to the strings[] * parameter; any individual length value may be negative to indicate * that the corresponding strings[] entry is 0-terminated, or * stringLengths itself may be NULL if all of the strings are * 0-terminated. If non-NULL, the stringLengths array must have * stringCount entries. * @param stringCount * The number of entries in strings[], and the number of entries * in the stringLengths array if it is not NULL. Must be >= 0. * @param result * A pointer to a buffer to receive the formatted list. * @param resultCapacity * The maximum size of result. * @param status * A pointer to a standard ICU UErrorCode (input/output parameter). * Its input value must pass the U_SUCCESS() test, or else the * function returns immediately. The caller should check its output * value with U_FAILURE(), or use with function chaining (see User * Guide for details).
* @return * The total buffer size needed; if greater than resultCapacity, the * output was truncated. May be <=0 if unable to determine the * total buffer size needed (e.g. for illegal arguments). * @stable ICU 55 */ U_CAPI int32_t U_EXPORT2 ulistfmt_format(const UListFormatter* listfmt, const UChar* const strings[], const int32_t * stringLengths, int32_t stringCount, UChar* result, int32_t resultCapacity, UErrorCode* status); #if (NTDDI_VERSION >= NTDDI_WIN10_CO) /** * Formats a list of strings to a UFormattedList, which exposes more * information than the string exported by ulistfmt_format(). * * @param listfmt * The UListFormatter object specifying the list conventions. * @param strings * An array of pointers to UChar strings; the array length is * specified by stringCount. Must be non-NULL if stringCount > 0. * @param stringLengths * An array of string lengths corresponding to the strings[] * parameter; any individual length value may be negative to indicate * that the corresponding strings[] entry is 0-terminated, or * stringLengths itself may be NULL if all of the strings are * 0-terminated. If non-NULL, the stringLengths array must have * stringCount entries. * @param stringCount * The number of entries in strings[], and the number of entries * in the stringLengths array if it is not NULL. Must be >= 0. * @param uresult * The object in which to store the result of the list formatting * operation. See ulistfmt_openResult(). * @param status * Error code set if an error occurred during formatting. * @stable ICU 64 */ U_CAPI void U_EXPORT2 ulistfmt_formatStringsToResult( const UListFormatter* listfmt, const UChar* const strings[], const int32_t * stringLengths, int32_t stringCount, UFormattedList* uresult, UErrorCode* status); #endif // (NTDDI_VERSION >= NTDDI_WIN10_CO) #endif /* #if !UCONFIG_NO_FORMATTING */ #endif // ulocdata.h // Copyright (C) 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html /* ****************************************************************************** * * * Copyright (C) 2003-2015, International Business Machines * * Corporation and others. All Rights Reserved. * * * ****************************************************************************** * file name: ulocdata.h * encoding: UTF-8 * tab size: 8 (not used) * indentation:4 * * created on: 2003Oct21 * created by: Ram Viswanadha */ #ifndef __ULOCDATA_H__ #define __ULOCDATA_H__ /** * \file * \brief C API: Provides access to locale data. */ /** Forward declaration of the ULocaleData structure. @stable ICU 3.6 */ struct ULocaleData; /** A locale data object. @stable ICU 3.6 */ typedef struct ULocaleData ULocaleData; /** The possible types of exemplar character sets. * @stable ICU 3.4 */ typedef enum ULocaleDataExemplarSetType { /** Basic set @stable ICU 3.4 */ ULOCDATA_ES_STANDARD=0, /** Auxiliary set @stable ICU 3.4 */ ULOCDATA_ES_AUXILIARY=1, /** Index Character set @stable ICU 4.8 */ ULOCDATA_ES_INDEX=2, /** Punctuation set @stable ICU 51 */ ULOCDATA_ES_PUNCTUATION=3, } ULocaleDataExemplarSetType; /** The possible types of delimiters. * @stable ICU 3.4 */ typedef enum ULocaleDataDelimiterType { /** Quotation start @stable ICU 3.4 */ ULOCDATA_QUOTATION_START = 0, /** Quotation end @stable ICU 3.4 */ ULOCDATA_QUOTATION_END = 1, /** Alternate quotation start @stable ICU 3.4 */ ULOCDATA_ALT_QUOTATION_START = 2, /** Alternate quotation end @stable ICU 3.4 */ ULOCDATA_ALT_QUOTATION_END = 3, } ULocaleDataDelimiterType; /** * Opens a locale data object for the given locale * * @param localeID Specifies the locale associated with this locale * data object. * @param status Pointer to error status code. * @stable ICU 3.4 */ U_CAPI ULocaleData* U_EXPORT2 ulocdata_open(const char *localeID, UErrorCode *status); /** * Closes a locale data object. 
* * @param uld The locale data object to close * @stable ICU 3.4 */ U_CAPI void U_EXPORT2 ulocdata_close(ULocaleData *uld); /** * Sets the "no Substitute" attribute of the locale data * object. If true, then any methods associated with the * locale data object will return null when there is no * data available for that method, given the locale ID * supplied to ulocdata_open(). * * @param uld The locale data object to set. * @param setting Value of the "no substitute" attribute. * @stable ICU 3.4 */ U_CAPI void U_EXPORT2 ulocdata_setNoSubstitute(ULocaleData *uld, UBool setting); /** * Retrieves the current "no Substitute" value of the locale data * object. If true, then any methods associated with the * locale data object will return null when there is no * data available for that method, given the locale ID * supplied to ulocdata_open(). * * @param uld Pointer to the locale data object to query. * @return UBool Value of the "no substitute" attribute. * @stable ICU 3.4 */ U_CAPI UBool U_EXPORT2 ulocdata_getNoSubstitute(ULocaleData *uld); /** * Returns the set of exemplar characters for a locale. * * @param uld Pointer to the locale data object from which the * exemplar character set is to be retrieved. * @param fillIn Pointer to a USet object to receive the * exemplar character set for the given locale. Previous * contents of fillIn are lost. If fillIn is NULL, * then a new USet is created and returned. The caller * owns the result and must dispose of it by calling * uset_close. * @param options Bitmask for options to apply to the exemplar pattern. * Specify zero to retrieve the exemplar set as it is * defined in the locale data. Specify * USET_CASE_INSENSITIVE to retrieve a case-folded * exemplar set. See uset_applyPattern for a complete * list of valid options. The USET_IGNORE_SPACE bit is * always set, regardless of the value of 'options'. * @param extype Specifies the type of exemplar set to be retrieved.
* @param status Pointer to an input-output error code value; * must not be NULL. Will be set to U_MISSING_RESOURCE_ERROR * if the requested data is not available. * @return USet* Either fillIn, or if fillIn is NULL, a pointer to * a newly-allocated USet that the user must close. * In case of error, NULL is returned. * @stable ICU 3.4 */ U_CAPI USet* U_EXPORT2 ulocdata_getExemplarSet(ULocaleData *uld, USet *fillIn, uint32_t options, ULocaleDataExemplarSetType extype, UErrorCode *status); /** * Returns one of the delimiter strings associated with a locale. * * @param uld Pointer to the locale data object from which the * delimiter string is to be retrieved. * @param type the type of delimiter to be retrieved. * @param result A pointer to a buffer to receive the result. * @param resultLength The maximum size of result. * @param status Pointer to an error code value * @return int32_t The total buffer size needed; if greater than resultLength, * the output was truncated. * @stable ICU 3.4 */ U_CAPI int32_t U_EXPORT2 ulocdata_getDelimiter(ULocaleData *uld, ULocaleDataDelimiterType type, UChar *result, int32_t resultLength, UErrorCode *status); /** * Enumeration for representing the measurement systems. * @stable ICU 2.8 */ typedef enum UMeasurementSystem { UMS_SI, /**< Measurement system specified by SI otherwise known as Metric system. @stable ICU 2.8 */ UMS_US, /**< Measurement system followed in the United States of America. @stable ICU 2.8 */ UMS_UK, /**< Mix of metric and imperial units used in Great Britain. @stable ICU 55 */ } UMeasurementSystem; /** * Returns the measurement system used in the locale specified by the localeID. * Please note that this API will change in ICU 3.6 and will use an ulocdata object. * * @param localeID The id of the locale for which the measurement system to be retrieved. * @param status Must be a valid pointer to an error code value, * which must not indicate a failure before the function call. 
* @return UMeasurementSystem the measurement system used in the locale. * @stable ICU 2.8 */ U_CAPI UMeasurementSystem U_EXPORT2 ulocdata_getMeasurementSystem(const char *localeID, UErrorCode *status); /** * Returns the normal business letter (paper) size for the locale. * The units for the numbers are always in milli-meters. * For US since 8.5 and 11 do not yield an integral value when converted to milli-meters, * the values are rounded off. * So for A4 size paper the height and width are 297 mm and 210 mm respectively, * and for US letter size the height and width are 279 mm and 216 mm respectively. * Please note that this API will change in ICU 3.6 and will use an ulocdata object. * * @param localeID The id of the locale for which the paper size information is to be retrieved. * @param height A pointer to int to receive the height information. * @param width A pointer to int to receive the width information. * @param status Must be a valid pointer to an error code value, * which must not indicate a failure before the function call. * @stable ICU 2.8 */ U_CAPI void U_EXPORT2 ulocdata_getPaperSize(const char *localeID, int32_t *height, int32_t *width, UErrorCode *status); /** * Return the current CLDR version used by the library. * @param versionArray fillin that will receive the version number * @param status error code - could be U_MISSING_RESOURCE_ERROR if the version was not found. * @stable ICU 4.2 */ U_CAPI void U_EXPORT2 ulocdata_getCLDRVersion(UVersionInfo versionArray, UErrorCode *status); /** * Returns locale display pattern associated with a locale. * * @param uld Pointer to the locale data object from which the * locale display pattern is to be retrieved. * @param pattern Buffer to receive the locale display pattern. * @param patternCapacity the size of the buffer to store the locale display * pattern with. * @param status Must be a valid pointer to an error code value, * which must not indicate a failure before the function call.
* @return the actual buffer size needed for localeDisplayPattern. If it's greater * than patternCapacity, the returned pattern will be truncated. * * @stable ICU 4.2 */ U_CAPI int32_t U_EXPORT2 ulocdata_getLocaleDisplayPattern(ULocaleData *uld, UChar *pattern, int32_t patternCapacity, UErrorCode *status); /** * Returns locale separator associated with a locale. * * @param uld Pointer to the locale data object from which the * exemplar character set is to be retrieved. * @param separator locale separator for locale. * @param separatorCapacity the size of the buffer to store the locale * separator with. * @param status Must be a valid pointer to an error code value, * which must not indicate a failure before the function call. * @return the actual buffer size needed for localeSeparator. If it's greater * than separatorCapacity, the returned separator will be truncated. * * @stable ICU 4.2 */ U_CAPI int32_t U_EXPORT2 ulocdata_getLocaleSeparator(ULocaleData *uld, UChar *separator, int32_t separatorCapacity, UErrorCode *status); #endif // umsg.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /******************************************************************** * COPYRIGHT: * Copyright (c) 1997-2011, International Business Machines Corporation and * others. All Rights Reserved. * Copyright (C) 2010 , Yahoo! Inc. ******************************************************************** * * file name: umsg.h * encoding: UTF-8 * tab size: 8 (not used) * indentation:4 * * Change history: * * 08/5/2001 Ram Added C wrappers for C++ API. ********************************************************************/ #ifndef UMSG_H #define UMSG_H #if !UCONFIG_NO_FORMATTING #include /** * \file * \brief C API: MessageFormat * *

MessageFormat C API

* *

MessageFormat prepares strings for display to users, * with optional arguments (variables/placeholders). * The arguments can occur in any order, which is necessary for translation * into languages with different grammars. * *

The opaque UMessageFormat type is a thin C wrapper around * a C++ MessageFormat. It is constructed from a pattern string * with arguments in {curly braces} which will be replaced by formatted values. * *

Currently, the C API supports only numbered arguments. * *

For details about the pattern syntax and behavior, * especially about the ASCII apostrophe vs. the * real apostrophe (single quote) character \htmlonly’\endhtmlonly (U+2019), * see the C++ MessageFormat class documentation. * *

Here are some examples of C API usage: * Example 1: *

 * \code
 *     UChar *result = NULL, *tzID, *str;
 *     UChar pattern[100];
 *     int32_t resultLengthOut, resultlength;
 *     UCalendar *cal;
 *     UDate d1;
 *     UDateFormat *def1;
 *     UErrorCode status = U_ZERO_ERROR;
 *
 *     str=(UChar*)malloc(sizeof(UChar) * (strlen("disturbance in force") +1));
 *     u_uastrcpy(str, "disturbance in force");
 *     tzID=(UChar*)malloc(sizeof(UChar) * 4);
 *     u_uastrcpy(tzID, "PST");
 *     cal=ucal_open(tzID, u_strlen(tzID), "en_US", UCAL_TRADITIONAL, &status);
 *     ucal_setDateTime(cal, 1999, UCAL_MARCH, 18, 0, 0, 0, &status);
 *     d1=ucal_getMillis(cal, &status);
 *     u_uastrcpy(pattern, "On {0, date, long}, there was a {1} on planet {2,number,integer}");
 *     resultlength=0;
 *     resultLengthOut=u_formatMessage( "en_US", pattern, u_strlen(pattern), NULL, resultlength, &status, d1, str, 7);
 *     if(status==U_BUFFER_OVERFLOW_ERROR){
 *         status=U_ZERO_ERROR;
 *         resultlength=resultLengthOut+1;
 *         result=(UChar*)realloc(result, sizeof(UChar) * resultlength);
 *         u_formatMessage( "en_US", pattern, u_strlen(pattern), result, resultlength, &status, d1, str, 7);
 *     }
 *     printf("%s\n", austrdup(result) );//austrdup( a function used to convert UChar* to char*)
 *     //output>: "On March 18, 1999, there was a disturbance in force on planet 7"
 * \endcode
 * 
* Typically, the message format will come from resources, and the * arguments will be dynamically set at runtime. *

* Example 2: *

 * \code
 *     UChar* str;
 *     UErrorCode status = U_ZERO_ERROR;
 *     UChar *result;
 *     UChar pattern[100];
 *     int32_t resultlength, resultLengthOut, i;
 *     double testArgs[] = { 100.0, 1.0, 0.0};
 *
 *     str=(UChar*)malloc(sizeof(UChar) * 10);
 *     u_uastrcpy(str, "MyDisk");
 *     u_uastrcpy(pattern, "The disk {1} contains {0,choice,0#no files|1#one file|1<{0,number,integer} files}");
 *     for(i=0; i<3; i++){
 *       resultlength=0; 
 *       resultLengthOut=u_formatMessage( "en_US", pattern, u_strlen(pattern), NULL, resultlength, &status, testArgs[i], str); 
 *       if(status==U_BUFFER_OVERFLOW_ERROR){
 *         status=U_ZERO_ERROR;
 *         resultlength=resultLengthOut+1;
 *         result=(UChar*)malloc(sizeof(UChar) * resultlength);
 *         u_formatMessage( "en_US", pattern, u_strlen(pattern), result, resultlength, &status, testArgs[i], str);
 *       }
 *       printf("%s\n", austrdup(result) );  //austrdup( a function used to convert UChar* to char*)
 *       free(result);
 *     }
 *     // output, with different testArgs:
 *     // output: The disk "MyDisk" contains 100 files.
 *     // output: The disk "MyDisk" contains one file.
 *     // output: The disk "MyDisk" contains no files.
 * \endcode
 *  
* * * Example 3: *
 * \code
 * UChar* str;
 * UChar* str1;
 * UErrorCode status = U_ZERO_ERROR;
 * UChar *result;
 * UChar pattern[100];
 * UChar expected[100];
 * int32_t resultlength,resultLengthOut;

 * str=(UChar*)malloc(sizeof(UChar) * 25);
 * u_uastrcpy(str, "Kirti");
 * str1=(UChar*)malloc(sizeof(UChar) * 25);
 * u_uastrcpy(str1, "female");
 * log_verbose("Testing message format with Select test #1\n:");
 * u_uastrcpy(pattern, "{0} est {1, select, female {all\\u00E9e} other {all\\u00E9}} \\u00E0 Paris.");
 * u_uastrcpy(expected, "Kirti est all\\u00E9e \\u00E0 Paris.");
 * resultlength=0;
 * resultLengthOut=u_formatMessage( "fr", pattern, u_strlen(pattern), NULL, resultlength, &status, str , str1);
 * if(status==U_BUFFER_OVERFLOW_ERROR)
 *  {
 *      status=U_ZERO_ERROR;
 *      resultlength=resultLengthOut+1;
 *      result=(UChar*)malloc(sizeof(UChar) * resultlength);
 *      u_formatMessage( "fr", pattern, u_strlen(pattern), result, resultlength, &status, str , str1);
 *      if(u_strcmp(result, expected)==0)
 *          log_verbose("PASS: MessageFormat successful on Select test#1\n");
 *      else{
 *          log_err("FAIL: Error in MessageFormat on Select test#1\n GOT %s EXPECTED %s\n", austrdup(result),
 *          austrdup(expected) );
 *      }
 *      free(result);
 * }
 * \endcode
 *  
*/ /** * Format a message for a locale. * This function may perform re-ordering of the arguments depending on the * locale. For all numeric arguments, double is assumed unless the type is * explicitly integer. All choice format arguments must be of type double. * @param locale The locale for which the message will be formatted * @param pattern The pattern specifying the message's format * @param patternLength The length of pattern * @param result A pointer to a buffer to receive the formatted message. * @param resultLength The maximum size of result. * @param status A pointer to an UErrorCode to receive any errors * @param ... A variable-length argument list containing the arguments specified * in pattern. * @return The total buffer size needed; if greater than resultLength, the * output was truncated. * @see u_parseMessage * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 u_formatMessage(const char *locale, const UChar *pattern, int32_t patternLength, UChar *result, int32_t resultLength, UErrorCode *status, ...); /** * Format a message for a locale. * This function may perform re-ordering of the arguments depending on the * locale. For all numeric arguments, double is assumed unless the type is * explicitly integer. All choice format arguments must be of type double. * @param locale The locale for which the message will be formatted * @param pattern The pattern specifying the message's format * @param patternLength The length of pattern * @param result A pointer to a buffer to receive the formatted message. * @param resultLength The maximum size of result. * @param ap A variable-length argument list containing the arguments specified * in pattern. * @param status A pointer to an UErrorCode to receive any errors * @return The total buffer size needed; if greater than resultLength, the * output was truncated.
* @see u_parseMessage * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 u_vformatMessage( const char *locale, const UChar *pattern, int32_t patternLength, UChar *result, int32_t resultLength, va_list ap, UErrorCode *status); /** * Parse a message. * For numeric arguments, this function will always use doubles. Integer types * should not be passed. * This function is not able to parse all output from {@link #u_formatMessage }. * @param locale The locale for which the message is formatted * @param pattern The pattern specifying the message's format * @param patternLength The length of pattern * @param source The text to parse. * @param sourceLength The length of source, or -1 if null-terminated. * @param status A pointer to an UErrorCode to receive any errors * @param ... A variable-length argument list containing the arguments * specified in pattern. * @see u_formatMessage * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 u_parseMessage( const char *locale, const UChar *pattern, int32_t patternLength, const UChar *source, int32_t sourceLength, UErrorCode *status, ...); /** * Parse a message. * For numeric arguments, this function will always use doubles. Integer types * should not be passed. * This function is not able to parse all output from {@link #u_formatMessage }. * @param locale The locale for which the message is formatted * @param pattern The pattern specifying the message's format * @param patternLength The length of pattern * @param source The text to parse. * @param sourceLength The length of source, or -1 if null-terminated. * @param ap A variable-length argument list containing the arguments * specified in pattern. * @param status A pointer to an UErrorCode to receive any errors * @see u_formatMessage * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 u_vparseMessage(const char *locale, const UChar *pattern, int32_t patternLength, const UChar *source, int32_t sourceLength, va_list ap, UErrorCode *status); /** * Format a message for a locale.
* This function may perform re-ordering of the arguments depending on the * locale. For all numeric arguments, double is assumed unless the type is * explicitly integer. All choice format arguments must be of type double. * @param locale The locale for which the message will be formatted * @param pattern The pattern specifying the message's format * @param patternLength The length of pattern * @param result A pointer to a buffer to receive the formatted message. * @param resultLength The maximum size of result. * @param status A pointer to an UErrorCode to receive any errors * @param ... A variable-length argument list containing the arguments specified * in pattern. * @param parseError A pointer to UParseError to receive information about errors * occurred during parsing. * @return The total buffer size needed; if greater than resultLength, the * output was truncated. * @see u_parseMessage * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 u_formatMessageWithError( const char *locale, const UChar *pattern, int32_t patternLength, UChar *result, int32_t resultLength, UParseError *parseError, UErrorCode *status, ...); /** * Format a message for a locale. * This function may perform re-ordering of the arguments depending on the * locale. For all numeric arguments, double is assumed unless the type is * explicitly integer. All choice format arguments must be of type double. * @param locale The locale for which the message will be formatted * @param pattern The pattern specifying the message's format * @param patternLength The length of pattern * @param result A pointer to a buffer to receive the formatted message. * @param resultLength The maximum size of result. * @param parseError A pointer to UParseError to receive information about errors * occurred during parsing. * @param ap A variable-length argument list containing the arguments specified * in pattern. * @param status A pointer to an UErrorCode to receive any errors
* @return The total buffer size needed; if greater than resultLength, the * output was truncated. * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 u_vformatMessageWithError( const char *locale, const UChar *pattern, int32_t patternLength, UChar *result, int32_t resultLength, UParseError* parseError, va_list ap, UErrorCode *status); /** * Parse a message. * For numeric arguments, this function will always use doubles. Integer types * should not be passed. * This function is not able to parse all output from {@link #u_formatMessage }. * @param locale The locale for which the message is formatted * @param pattern The pattern specifying the message's format * @param patternLength The length of pattern * @param source The text to parse. * @param sourceLength The length of source, or -1 if null-terminated. * @param parseError A pointer to UParseError to receive information about errors * occurred during parsing. * @param status A pointer to an UErrorCode to receive any errors * @param ... A variable-length argument list containing the arguments * specified in pattern. * @see u_formatMessage * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 u_parseMessageWithError(const char *locale, const UChar *pattern, int32_t patternLength, const UChar *source, int32_t sourceLength, UParseError *parseError, UErrorCode *status, ...); /** * Parse a message. * For numeric arguments, this function will always use doubles. Integer types * should not be passed. * This function is not able to parse all output from {@link #u_formatMessage }. * @param locale The locale for which the message is formatted * @param pattern The pattern specifying the message's format * @param patternLength The length of pattern * @param source The text to parse. * @param sourceLength The length of source, or -1 if null-terminated. * @param ap A variable-length argument list containing the arguments * specified in pattern. * @param parseError A pointer to UParseError to receive information about errors * occurred during parsing.
* @param status A pointer to an UErrorCode to receive any errors * @see u_formatMessage * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 u_vparseMessageWithError(const char *locale, const UChar *pattern, int32_t patternLength, const UChar *source, int32_t sourceLength, va_list ap, UParseError *parseError, UErrorCode* status); /*----------------------- New experimental API --------------------------- */ /** * The message format object * @stable ICU 2.0 */ typedef void* UMessageFormat; /** * Open a message formatter with given pattern and for the given locale. * @param pattern A pattern specifying the format to use. * @param patternLength Length of the pattern to use * @param locale The locale for which the messages are formatted. * @param parseError A pointer to UParseError struct to receive any errors * occurred during parsing. Can be NULL. * @param status A pointer to an UErrorCode to receive any errors. * @return A pointer to a UMessageFormat to use for formatting * messages, or 0 if an error occurred. * @stable ICU 2.0 */ U_CAPI UMessageFormat* U_EXPORT2 umsg_open( const UChar *pattern, int32_t patternLength, const char *locale, UParseError *parseError, UErrorCode *status); /** * Close a UMessageFormat. * Once closed, a UMessageFormat may no longer be used. * @param format The formatter to close. * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 umsg_close(UMessageFormat* format); /** * Open a copy of a UMessageFormat. * This function performs a deep copy. * @param fmt The formatter to copy * @param status A pointer to an UErrorCode to receive any errors. * @return A pointer to a UMessageFormat identical to fmt. * @stable ICU 2.0 */ U_CAPI UMessageFormat U_EXPORT2 umsg_clone(const UMessageFormat *fmt, UErrorCode *status); /** * Sets the locale. This locale is used for fetching default number or date * format information. * @param fmt The formatter to set * @param locale The locale the formatter should use.
* @stable ICU 2.0 */ U_CAPI void U_EXPORT2 umsg_setLocale(UMessageFormat *fmt, const char* locale); /** * Gets the locale. This locale is used for fetching default number or date * format information. * @param fmt The formatter to query * @return the locale. * @stable ICU 2.0 */ U_CAPI const char* U_EXPORT2 umsg_getLocale(const UMessageFormat *fmt); /** * Sets the pattern. * @param fmt The formatter to use * @param pattern The pattern to be applied. * @param patternLength Length of the pattern to use * @param parseError Struct to receive information on position * of error if an error is encountered. Can be NULL. * @param status Output param set to success/failure code on * exit. If the pattern is invalid, this will be * set to a failure result. * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 umsg_applyPattern( UMessageFormat *fmt, const UChar* pattern, int32_t patternLength, UParseError* parseError, UErrorCode* status); /** * Gets the pattern. * @param fmt The formatter to use * @param result A pointer to a buffer to receive the pattern. * @param resultLength The maximum size of result. * @param status Output param set to success/failure code on * exit. If the pattern is invalid, this will be * set to a failure result. * @return the pattern of the format * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 umsg_toPattern(const UMessageFormat *fmt, UChar* result, int32_t resultLength, UErrorCode* status); /** * Format a message for a locale. * This function may perform re-ordering of the arguments depending on the * locale. For all numeric arguments, double is assumed unless the type is * explicitly integer. All choice format arguments must be of type double. * @param fmt The formatter to use * @param result A pointer to a buffer to receive the formatted message. * @param resultLength The maximum size of result. * @param status A pointer to an UErrorCode to receive any errors * @param ... A variable-length argument list containing the arguments * specified in pattern.
* @return The total buffer size needed; if greater than resultLength, * the output was truncated. * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 umsg_format( const UMessageFormat *fmt, UChar *result, int32_t resultLength, UErrorCode *status, ...); /** * Format a message for a locale. * This function may perform re-ordering of the arguments depending on the * locale. For all numeric arguments, double is assumed unless the type is * explicitly integer. All choice format arguments must be of type double. * @param fmt The formatter to use * @param result A pointer to a buffer to receive the formatted message. * @param resultLength The maximum size of result. * @param ap A variable-length argument list containing the arguments * specified in pattern. * @param status A pointer to an UErrorCode to receive any errors * @return The total buffer size needed; if greater than resultLength, * the output was truncated. * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 umsg_vformat( const UMessageFormat *fmt, UChar *result, int32_t resultLength, va_list ap, UErrorCode *status); /** * Parse a message. * For numeric arguments, this function will always use doubles. Integer types * should not be passed. * This function is not able to parse all output from {@link #umsg_format }. * @param fmt The formatter to use * @param source The text to parse. * @param sourceLength The length of source, or -1 if null-terminated. * @param count Output param to receive number of elements returned. * @param status A pointer to an UErrorCode to receive any errors * @param ... A variable-length argument list containing the arguments * specified in pattern. * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 umsg_parse( const UMessageFormat *fmt, const UChar *source, int32_t sourceLength, int32_t *count, UErrorCode *status, ...); /** * Parse a message. * For numeric arguments, this function will always use doubles. Integer types * should not be passed.
* This function is not able to parse all output from {@link #umsg_format }. * @param fmt The formatter to use * @param source The text to parse. * @param sourceLength The length of source, or -1 if null-terminated. * @param count Output param to receive number of elements returned. * @param ap A variable-length argument list containing the arguments * specified in pattern. * @param status A pointer to an UErrorCode to receive any errors * @see u_formatMessage * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 umsg_vparse(const UMessageFormat *fmt, const UChar *source, int32_t sourceLength, int32_t *count, va_list ap, UErrorCode *status); /** * Convert an 'apostrophe-friendly' pattern into a standard * pattern. Standard patterns treat all apostrophes as * quotes, which is problematic in some languages, e.g. * French, where apostrophe is commonly used. This utility * assumes that only an unpaired apostrophe immediately before * a brace is a true quote. Other unpaired apostrophes are paired, * and the resulting standard pattern string is returned. * *

Note it is not guaranteed that the returned pattern * is indeed a valid pattern. The only effect is to convert * between patterns having different quoting semantics. * * @param pattern the 'apostrophe-friendly' pattern to convert * @param patternLength the length of pattern, or -1 if unknown and pattern is null-terminated * @param dest the buffer for the result, or NULL if preflight only * @param destCapacity the length of the buffer, or 0 if preflighting * @param ec the error code * @return the length of the resulting text, not including trailing null; * if the buffer has room for the trailing null, it is provided, otherwise * not * @stable ICU 3.4 */ U_CAPI int32_t U_EXPORT2 umsg_autoQuoteApostrophe(const UChar* pattern, int32_t patternLength, UChar* dest, int32_t destCapacity, UErrorCode* ec); #endif /* #if !UCONFIG_NO_FORMATTING */ #endif // unirepl.h // No supported content // unum.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 1997-2015, International Business Machines Corporation and others. * All Rights Reserved. * Modification History: * * Date Name Description * 06/24/99 helena Integrated Alan's NF enhancements and Java2 bug fixes ******************************************************************************* */ #ifndef _UNUM #define _UNUM #if !UCONFIG_NO_FORMATTING /** * \file * \brief C API: Compatibility APIs for number formatting. * *

Number Format C API

* *

IMPORTANT: New users are strongly encouraged to * see if unumberformatter.h fits their use case. Although not deprecated, * this header is provided for backwards compatibility only. * * The Number Format C API provides functions for * formatting and parsing a number. Also provides methods for * determining which locales have number formats, and what their names * are. *

* UNumberFormat helps you to format and parse numbers for any locale. * Your code can be completely independent of the locale conventions * for decimal points, thousands-separators, or even the particular * decimal digits used, or whether the number format is even decimal. * There are different number format styles like decimal, currency, * percent and spellout. *

* To format a number for the current Locale, use one of the static * factory methods: *

 * \code
 *    UChar myString[20];
 *    double myNumber = 7.0;
 *    UErrorCode status = U_ZERO_ERROR;
 *    UNumberFormat* nf = unum_open(UNUM_DEFAULT, NULL, -1, NULL, NULL, &status);
 *    unum_formatDouble(nf, myNumber, myString, 20, NULL, &status);
 *    printf(" Example 1: %s\n", austrdup(myString) ); //austrdup( a function used to convert UChar* to char*)
 * \endcode
 * 
* If you are formatting multiple numbers, it is more efficient to get * the format and use it multiple times so that the system doesn't * have to fetch the information about the local language and country * conventions multiple times. *
 * \code
 * uint32_t i;
 * int32_t resultlength, reslenneeded;
 * UErrorCode status = U_ZERO_ERROR;
 * UFieldPosition pos;
 * int32_t a[] = { 123, 3333, -1234567 };
 * const uint32_t a_len = sizeof(a) / sizeof(a[0]);
 * UNumberFormat* nf;
 * UChar* result = NULL;
 *
 * nf = unum_open(UNUM_DEFAULT, NULL, -1, NULL, NULL, &status);
 * for (i = 0; i < a_len; i++) {
 *    resultlength=0;
 *    reslenneeded=unum_format(nf, a[i], NULL, resultlength, &pos, &status);
 *    result = NULL;
 *    if(status==U_BUFFER_OVERFLOW_ERROR){
 *       status=U_ZERO_ERROR;
 *       resultlength=reslenneeded+1;
 *       result=(UChar*)malloc(sizeof(UChar) * resultlength);
 *       unum_format(nf, a[i], result, resultlength, &pos, &status);
 *    }
 *    printf( " Example 2: %s\n", austrdup(result));
 *    free(result);
 * }
 * \endcode
 * 
* To format a number for a different Locale, specify it in the * call to unum_open(). *
 * \code
 *     UNumberFormat* nf = unum_open(UNUM_DEFAULT, NULL, -1, "fr_FR", NULL, &status);
 * \endcode
 * 
* You can use a NumberFormat API unum_parse() to parse. *
 * \code
 *    UErrorCode status = U_ZERO_ERROR;
 *    int32_t pos=0;
 *    int32_t num;
 *    num = unum_parse(nf, str, u_strlen(str), &pos, &status);
 * \endcode
 * 
* Use UNUM_DECIMAL to get the normal number format for that country. * There are other static options available. Use UNUM_CURRENCY * to get the currency number format for that country. Use UNUM_PERCENT * to get a format for displaying percentages. With this format, a * fraction from 0.53 is displayed as 53%. *

* Use a pattern to create either a DecimalFormat or a RuleBasedNumberFormat * formatter. The pattern must conform to the syntax defined for those * formatters. *

* You can also control the display of numbers with functions such as * unum_getAttribute() and unum_setAttribute(), which let you set the * minimum fraction digits, grouping, etc. * @see UNumberFormatAttribute for more details *

* You can also use forms of the parse and format methods with * ParsePosition and UFieldPosition to allow you to: *

    *
  • (a) progressively parse through pieces of a string. *
  • (b) align the decimal point and other areas. *
*

* It is also possible to change or set the symbols used for a particular * locale like the currency symbol, the grouping separator , monetary separator * etc by making use of functions unum_setSymbols() and unum_getSymbols(). */ /** A number formatter. * For usage in C programs. * Instances are created by unum_open() and released with unum_close(). * @stable ICU 2.0 */ typedef void* UNumberFormat; /** The possible number format styles. * Passed as the style argument of unum_open(). * @stable ICU 2.0 */ typedef enum UNumberFormatStyle { /** * Decimal format defined by a pattern string. * @stable ICU 3.0 */ UNUM_PATTERN_DECIMAL=0, /** * Decimal format ("normal" style). * @stable ICU 2.0 */ UNUM_DECIMAL=1, /** * Currency format (generic). * Defaults to UNUM_CURRENCY_STANDARD style * (using currency symbol, e.g., "$1.00", with non-accounting * style for negative values e.g. using minus sign). * The specific style may be specified using the -cf- locale key. * @stable ICU 2.0 */ UNUM_CURRENCY=2, /** * Percent format * @stable ICU 2.0 */ UNUM_PERCENT=3, /** * Scientific format * @stable ICU 2.1 */ UNUM_SCIENTIFIC=4, /** * Spellout rule-based format. The default ruleset can be specified/changed using * unum_setTextAttribute with UNUM_DEFAULT_RULESET; the available public rulesets * can be listed using unum_getTextAttribute with UNUM_PUBLIC_RULESETS. * @stable ICU 2.0 */ UNUM_SPELLOUT=5, /** * Ordinal rule-based format. The default ruleset can be specified/changed using * unum_setTextAttribute with UNUM_DEFAULT_RULESET; the available public rulesets * can be listed using unum_getTextAttribute with UNUM_PUBLIC_RULESETS. * @stable ICU 3.0 */ UNUM_ORDINAL=6, /** * Duration rule-based format * @stable ICU 3.0 */ UNUM_DURATION=7, /** * Numbering system rule-based format * @stable ICU 4.2 */ UNUM_NUMBERING_SYSTEM=8, /** * Rule-based format defined by a pattern string. * @stable ICU 3.0 */ UNUM_PATTERN_RULEBASED=9, /** * Currency format with an ISO currency code, e.g., "USD1.00".
* @stable ICU 4.8 */ UNUM_CURRENCY_ISO=10, /** * Currency format with a pluralized currency name, * e.g., "1.00 US dollar" and "3.00 US dollars". * @stable ICU 4.8 */ UNUM_CURRENCY_PLURAL=11, /** * Currency format for accounting, e.g., "($3.00)" for * negative currency amount instead of "-$3.00" ({@link #UNUM_CURRENCY}). * Overrides any style specified using -cf- key in locale. * @stable ICU 53 */ UNUM_CURRENCY_ACCOUNTING=12, /** * Currency format with a currency symbol given CASH usage, e.g., * "NT$3" instead of "NT$3.23". * @stable ICU 54 */ UNUM_CASH_CURRENCY=13, /** * Decimal format expressed using compact notation * (short form, corresponds to UNumberCompactStyle=UNUM_SHORT) * e.g. "23K", "45B" * @stable ICU 56 */ UNUM_DECIMAL_COMPACT_SHORT=14, /** * Decimal format expressed using compact notation * (long form, corresponds to UNumberCompactStyle=UNUM_LONG) * e.g. "23 thousand", "45 billion" * @stable ICU 56 */ UNUM_DECIMAL_COMPACT_LONG=15, /** * Currency format with a currency symbol, e.g., "$1.00", * using non-accounting style for negative values (e.g. minus sign). * Overrides any style specified using -cf- key in locale. * @stable ICU 56 */ UNUM_CURRENCY_STANDARD=16, /** * Default format (same value as UNUM_DECIMAL) * @stable ICU 2.0 */ UNUM_DEFAULT = UNUM_DECIMAL, /** * Alias for UNUM_PATTERN_DECIMAL * @stable ICU 3.0 */ UNUM_IGNORE = UNUM_PATTERN_DECIMAL } UNumberFormatStyle; /** The possible number format rounding modes. * *

* For more detail on rounding modes, see: * https://unicode-org.github.io/icu/userguide/format_parse/numbers/rounding-modes * * @stable ICU 2.0 */ typedef enum UNumberFormatRoundingMode { UNUM_ROUND_CEILING, UNUM_ROUND_FLOOR, UNUM_ROUND_DOWN, UNUM_ROUND_UP, /** * Half-even rounding * @stable ICU 3.8 */ UNUM_ROUND_HALFEVEN, UNUM_ROUND_HALFDOWN = UNUM_ROUND_HALFEVEN + 1, UNUM_ROUND_HALFUP, /** * ROUND_UNNECESSARY reports an error if formatted result is not exact. * @stable ICU 4.8 */ UNUM_ROUND_UNNECESSARY, #if (NTDDI_VERSION >= NTDDI_WIN11_ZN) /** * Rounds ties toward the odd number. * @stable ICU 69 */ UNUM_ROUND_HALF_ODD, /** * Rounds ties toward +∞. * @stable ICU 69 */ UNUM_ROUND_HALF_CEILING, /** * Rounds ties toward -∞. * @stable ICU 69 */ UNUM_ROUND_HALF_FLOOR, #endif // (NTDDI_VERSION >= NTDDI_WIN11_ZN) } UNumberFormatRoundingMode; /** The possible number format pad positions. * @stable ICU 2.0 */ typedef enum UNumberFormatPadPosition { UNUM_PAD_BEFORE_PREFIX, UNUM_PAD_AFTER_PREFIX, UNUM_PAD_BEFORE_SUFFIX, UNUM_PAD_AFTER_SUFFIX } UNumberFormatPadPosition; /** * Constants for specifying short or long format. * @stable ICU 51 */ typedef enum UNumberCompactStyle { /** @stable ICU 51 */ UNUM_SHORT, /** @stable ICU 51 */ UNUM_LONG /** @stable ICU 51 */ } UNumberCompactStyle; /** * Constants for specifying currency spacing * @stable ICU 4.8 */ enum UCurrencySpacing { /** @stable ICU 4.8 */ UNUM_CURRENCY_MATCH, /** @stable ICU 4.8 */ UNUM_CURRENCY_SURROUNDING_MATCH, /** @stable ICU 4.8 */ UNUM_CURRENCY_INSERT, /* Do not conditionalize the following with #ifndef U_HIDE_DEPRECATED_API, * it is needed for layout of DecimalFormatSymbols object. */ #ifndef U_FORCE_HIDE_DEPRECATED_API /** * One more than the highest normal UCurrencySpacing value. * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
*/ UNUM_CURRENCY_SPACING_COUNT #endif // U_FORCE_HIDE_DEPRECATED_API }; typedef enum UCurrencySpacing UCurrencySpacing; /**< @stable ICU 4.8 */ /** * FieldPosition and UFieldPosition selectors for format fields * defined by NumberFormat and UNumberFormat. * @stable ICU 49 */ typedef enum UNumberFormatFields { /** @stable ICU 49 */ UNUM_INTEGER_FIELD, /** @stable ICU 49 */ UNUM_FRACTION_FIELD, /** @stable ICU 49 */ UNUM_DECIMAL_SEPARATOR_FIELD, /** @stable ICU 49 */ UNUM_EXPONENT_SYMBOL_FIELD, /** @stable ICU 49 */ UNUM_EXPONENT_SIGN_FIELD, /** @stable ICU 49 */ UNUM_EXPONENT_FIELD, /** @stable ICU 49 */ UNUM_GROUPING_SEPARATOR_FIELD, /** @stable ICU 49 */ UNUM_CURRENCY_FIELD, /** @stable ICU 49 */ UNUM_PERCENT_FIELD, /** @stable ICU 49 */ UNUM_PERMILL_FIELD, /** @stable ICU 49 */ UNUM_SIGN_FIELD, #if (NTDDI_VERSION >= NTDDI_WIN10_CO) /** @stable ICU 64 */ UNUM_MEASURE_UNIT_FIELD, /** @stable ICU 64 */ UNUM_COMPACT_FIELD, #endif // (NTDDI_VERSION >= NTDDI_WIN10_CO) } UNumberFormatFields; #if (NTDDI_VERSION >= NTDDI_WIN11_ZN) /** * Selectors with special numeric values to use locale default minimum grouping * digits for the DecimalFormat/UNumberFormat setMinimumGroupingDigits method. * Do not use these constants with the [U]NumberFormatter API. * * @stable ICU 68 */ typedef enum UNumberFormatMinimumGroupingDigits { /** * Display grouping using the default strategy for all locales. * @stable ICU 68 */ UNUM_MINIMUM_GROUPING_DIGITS_AUTO = -2, /** * Display grouping using locale defaults, except do not show grouping on * values smaller than 10000 (such that there is a minimum of two digits * before the first separator). * @stable ICU 68 */ UNUM_MINIMUM_GROUPING_DIGITS_MIN2 = -3, } UNumberFormatMinimumGroupingDigits; #endif // (NTDDI_VERSION >= NTDDI_WIN11_ZN) /** * Create and return a new UNumberFormat for formatting and parsing * numbers.
A UNumberFormat may be used to format numbers by calling * {@link #unum_format }, and to parse numbers by calling {@link #unum_parse }. * The caller must call {@link #unum_close } when done to release resources * used by this object. * @param style The type of number format to open: one of * UNUM_DECIMAL, UNUM_CURRENCY, UNUM_PERCENT, UNUM_SCIENTIFIC, * UNUM_CURRENCY_ISO, UNUM_CURRENCY_PLURAL, UNUM_SPELLOUT, * UNUM_ORDINAL, UNUM_DURATION, UNUM_NUMBERING_SYSTEM, * UNUM_PATTERN_DECIMAL, UNUM_PATTERN_RULEBASED, or UNUM_DEFAULT. * If UNUM_PATTERN_DECIMAL or UNUM_PATTERN_RULEBASED is passed then the * number format is opened using the given pattern, which must conform * to the syntax described in DecimalFormat or RuleBasedNumberFormat, * respectively. * *

NOTE: New users are strongly encouraged to * use unumf_openForSkeletonAndLocale instead of unum_open. * * @param pattern A pattern specifying the format to use. * This parameter is ignored unless the style is * UNUM_PATTERN_DECIMAL or UNUM_PATTERN_RULEBASED. * @param patternLength The number of characters in the pattern, or -1 * if null-terminated. This parameter is ignored unless the style is * UNUM_PATTERN_DECIMAL or UNUM_PATTERN_RULEBASED. * @param locale A locale identifier to use to determine formatting * and parsing conventions, or NULL to use the default locale. * @param parseErr A pointer to a UParseError struct to receive the * details of any parsing errors, or NULL if no parsing error details * are desired. * @param status A pointer to an input-output UErrorCode. * @return A pointer to a newly created UNumberFormat, or NULL if an * error occurred. * @see unum_close * @see DecimalFormat * @stable ICU 2.0 */ U_CAPI UNumberFormat* U_EXPORT2 unum_open( UNumberFormatStyle style, const UChar* pattern, int32_t patternLength, const char* locale, UParseError* parseErr, UErrorCode* status); /** * Close a UNumberFormat. * Once closed, a UNumberFormat may no longer be used. * @param fmt The formatter to close. * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 unum_close(UNumberFormat* fmt); /** * Open a copy of a UNumberFormat. * This function performs a deep copy. * @param fmt The format to copy * @param status A pointer to an UErrorCode to receive any errors. * @return A pointer to a UNumberFormat identical to fmt. * @stable ICU 2.0 */ U_CAPI UNumberFormat* U_EXPORT2 unum_clone(const UNumberFormat *fmt, UErrorCode *status); /** * Format an integer using a UNumberFormat. * The integer will be formatted according to the UNumberFormat's locale. * @param fmt The formatter to use. * @param number The number to format. * @param result A pointer to a buffer to receive the NULL-terminated formatted number.
If * the formatted number fits into result but cannot be NULL-terminated (length == resultLength) * then the error code is set to U_STRING_NOT_TERMINATED_WARNING. If the formatted number * doesn't fit into result then the error code is set to U_BUFFER_OVERFLOW_ERROR. * @param resultLength The maximum size of result. * @param pos A pointer to a UFieldPosition. On input, position->field * is read. On output, position->beginIndex and position->endIndex indicate * the beginning and ending indices of field number position->field, if such * a field exists. This parameter may be NULL, in which case it is ignored. * @param status A pointer to an UErrorCode to receive any errors * @return The total buffer size needed; if greater than resultLength, the output was truncated. * @see unum_formatInt64 * @see unum_formatDouble * @see unum_parse * @see unum_parseInt64 * @see unum_parseDouble * @see UFieldPosition * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 unum_format( const UNumberFormat* fmt, int32_t number, UChar* result, int32_t resultLength, UFieldPosition *pos, UErrorCode* status); /** * Format an int64 using a UNumberFormat. * The int64 will be formatted according to the UNumberFormat's locale. * @param fmt The formatter to use. * @param number The number to format. * @param result A pointer to a buffer to receive the NULL-terminated formatted number. If * the formatted number fits into dest but cannot be NULL-terminated (length == resultLength) * then the error code is set to U_STRING_NOT_TERMINATED_WARNING. If the formatted number * doesn't fit into result then the error code is set to U_BUFFER_OVERFLOW_ERROR. * @param resultLength The maximum size of result. * @param pos A pointer to a UFieldPosition. On input, position->field * is read. On output, position->beginIndex and position->endIndex indicate * the beginning and ending indices of field number position->field, if such * a field exists.
This parameter may be NULL, in which case no field * @param status A pointer to an UErrorCode to receive any errors * @return The total buffer size needed; if greater than resultLength, the output was truncated. * @see unum_format * @see unum_formatDouble * @see unum_parse * @see unum_parseInt64 * @see unum_parseDouble * @see UFieldPosition * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 unum_formatInt64(const UNumberFormat *fmt, int64_t number, UChar* result, int32_t resultLength, UFieldPosition *pos, UErrorCode* status); /** * Format a double using a UNumberFormat. * The double will be formatted according to the UNumberFormat's locale. * @param fmt The formatter to use. * @param number The number to format. * @param result A pointer to a buffer to receive the NULL-terminated formatted number. If * the formatted number fits into result but cannot be NULL-terminated (length == resultLength) * then the error code is set to U_STRING_NOT_TERMINATED_WARNING. If the formatted number * doesn't fit into result then the error code is set to U_BUFFER_OVERFLOW_ERROR. * @param resultLength The maximum size of result. * @param pos A pointer to a UFieldPosition. On input, position->field * is read. On output, position->beginIndex and position->endIndex indicate * the beginning and ending indices of field number position->field, if such * a field exists. This parameter may be NULL, in which case it is ignored. * @param status A pointer to an UErrorCode to receive any errors * @return The total buffer size needed; if greater than resultLength, the output was truncated.
* @see unum_format * @see unum_formatInt64 * @see unum_parse * @see unum_parseInt64 * @see unum_parseDouble * @see UFieldPosition * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 unum_formatDouble( const UNumberFormat* fmt, double number, UChar* result, int32_t resultLength, UFieldPosition *pos, /* 0 if ignore */ UErrorCode* status); #if (NTDDI_VERSION >= NTDDI_WIN10_RS5) /** * Format a double using a UNumberFormat according to the UNumberFormat's locale, * and initialize a UFieldPositionIterator that enumerates the subcomponents of * the resulting string. * * @param format * The formatter to use. * @param number * The number to format. * @param result * A pointer to a buffer to receive the NULL-terminated formatted * number. If the formatted number fits into result but cannot be * NULL-terminated (length == resultLength) then the error code is set * to U_STRING_NOT_TERMINATED_WARNING. If the formatted number doesn't * fit into result then the error code is set to * U_BUFFER_OVERFLOW_ERROR. * @param resultLength * The maximum size of result. * @param fpositer * A pointer to a UFieldPositionIterator created by {@link #ufieldpositer_open} * (may be NULL if field position information is not needed, but in this * case it's preferable to use {@link #unum_formatDouble}). Iteration * information already present in the UFieldPositionIterator is deleted, * and the iterator is reset to apply to the fields in the formatted * string created by this function call. The field values and indexes * returned by {@link #ufieldpositer_next} represent fields denoted by * the UNumberFormatFields enum. Fields are not returned in a guaranteed * order. Fields cannot overlap, but they may nest. For example, 1234 * could format as "1,234" which might consist of a grouping separator * field for ',' and an integer field encompassing the entire string.
* @param status * A pointer to an UErrorCode to receive any errors * @return * The total buffer size needed; if greater than resultLength, the * output was truncated. * @see unum_formatDouble * @see unum_parse * @see unum_parseDouble * @see UFieldPositionIterator * @see UNumberFormatFields * @stable ICU 59 */ U_CAPI int32_t U_EXPORT2 unum_formatDoubleForFields(const UNumberFormat* format, double number, UChar* result, int32_t resultLength, UFieldPositionIterator* fpositer, UErrorCode* status); #endif // (NTDDI_VERSION >= NTDDI_WIN10_RS5) /** * Format a decimal number using a UNumberFormat. * The number will be formatted according to the UNumberFormat's locale. * The syntax of the input number is a "numeric string" * as defined in the Decimal Arithmetic Specification, available at * http://speleotrove.com/decimal * @param fmt The formatter to use. * @param number The number to format. * @param length The length of the input number, or -1 if the input is nul-terminated. * @param result A pointer to a buffer to receive the NULL-terminated formatted number. If * the formatted number fits into result but cannot be NULL-terminated (length == resultLength) * then the error code is set to U_STRING_NOT_TERMINATED_WARNING. If the formatted number * doesn't fit into result then the error code is set to U_BUFFER_OVERFLOW_ERROR. * @param resultLength The maximum size of result. * @param pos A pointer to a UFieldPosition. On input, position->field * is read. On output, position->beginIndex and position->endIndex indicate * the beginning and ending indices of field number position->field, if such * a field exists. This parameter may be NULL, in which case it is ignored. * @param status A pointer to an UErrorCode to receive any errors * @return The total buffer size needed; if greater than resultLength, the output was truncated.
* @see unum_format * @see unum_formatInt64 * @see unum_parse * @see unum_parseInt64 * @see unum_parseDouble * @see UFieldPosition * @stable ICU 4.4 */ U_CAPI int32_t U_EXPORT2 unum_formatDecimal( const UNumberFormat* fmt, const char * number, int32_t length, UChar* result, int32_t resultLength, UFieldPosition *pos, /* 0 if ignore */ UErrorCode* status); /** * Format a double currency amount using a UNumberFormat. * The double will be formatted according to the UNumberFormat's locale. * * To format an exact decimal value with a currency, use * `unum_setTextAttribute(UNUM_CURRENCY_CODE, ...)` followed by unum_formatDecimal. * Your UNumberFormat must be created with the UNUM_CURRENCY style. Alternatively, * consider using unumf_openForSkeletonAndLocale. * * @param fmt the formatter to use * @param number the number to format * @param currency the 3-letter null-terminated ISO 4217 currency code * @param result A pointer to a buffer to receive the NULL-terminated formatted number. If * the formatted number fits into result but cannot be NULL-terminated (length == resultLength) * then the error code is set to U_STRING_NOT_TERMINATED_WARNING. If the formatted number * doesn't fit into result then the error code is set to U_BUFFER_OVERFLOW_ERROR. * @param resultLength the maximum number of UChars to write to result * @param pos a pointer to a UFieldPosition. On input, * position->field is read. On output, position->beginIndex and * position->endIndex indicate the beginning and ending indices of * field number position->field, if such a field exists. This * parameter may be NULL, in which case it is ignored. * @param status a pointer to an input-output UErrorCode * @return the total buffer size needed; if greater than resultLength, * the output was truncated.
* @see unum_formatDouble * @see unum_parseDoubleCurrency * @see UFieldPosition * @stable ICU 3.0 */ U_CAPI int32_t U_EXPORT2 unum_formatDoubleCurrency(const UNumberFormat* fmt, double number, UChar* currency, UChar* result, int32_t resultLength, UFieldPosition* pos, UErrorCode* status); /** * Format a UFormattable into a string. * @param fmt the formatter to use * @param number the number to format, as a UFormattable * @param result A pointer to a buffer to receive the NULL-terminated formatted number. If * the formatted number fits into result but cannot be NULL-terminated (length == resultLength) * then the error code is set to U_STRING_NOT_TERMINATED_WARNING. If the formatted number * doesn't fit into result then the error code is set to U_BUFFER_OVERFLOW_ERROR. * @param resultLength the maximum number of UChars to write to result * @param pos a pointer to a UFieldPosition. On input, * position->field is read. On output, position->beginIndex and * position->endIndex indicate the beginning and ending indices of * field number position->field, if such a field exists. This * parameter may be NULL, in which case it is ignored. * @param status a pointer to an input-output UErrorCode * @return the total buffer size needed; if greater than resultLength, * the output was truncated. Will return 0 on error. * @see unum_parseToUFormattable * @stable ICU 52 */ U_CAPI int32_t U_EXPORT2 unum_formatUFormattable(const UNumberFormat* fmt, const UFormattable *number, UChar *result, int32_t resultLength, UFieldPosition *pos, UErrorCode *status); /** * Parse a string into an integer using a UNumberFormat. * The string will be parsed according to the UNumberFormat's locale. * Note: parsing is not supported for styles UNUM_DECIMAL_COMPACT_SHORT * and UNUM_DECIMAL_COMPACT_LONG. * @param fmt The formatter to use. * @param text The text to parse. * @param textLength The length of text, or -1 if null-terminated.
* @param parsePos If not NULL, on input a pointer to an integer specifying the offset at which * to begin parsing. If not NULL, on output the offset at which parsing ended. * @param status A pointer to an UErrorCode to receive any errors * @return The value of the parsed integer * @see unum_parseInt64 * @see unum_parseDouble * @see unum_format * @see unum_formatInt64 * @see unum_formatDouble * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 unum_parse( const UNumberFormat* fmt, const UChar* text, int32_t textLength, int32_t *parsePos /* 0 = start */, UErrorCode *status); /** * Parse a string into an int64 using a UNumberFormat. * The string will be parsed according to the UNumberFormat's locale. * Note: parsing is not supported for styles UNUM_DECIMAL_COMPACT_SHORT * and UNUM_DECIMAL_COMPACT_LONG. * @param fmt The formatter to use. * @param text The text to parse. * @param textLength The length of text, or -1 if null-terminated. * @param parsePos If not NULL, on input a pointer to an integer specifying the offset at which * to begin parsing. If not NULL, on output the offset at which parsing ended. * @param status A pointer to an UErrorCode to receive any errors * @return The value of the parsed integer * @see unum_parse * @see unum_parseDouble * @see unum_format * @see unum_formatInt64 * @see unum_formatDouble * @stable ICU 2.8 */ U_CAPI int64_t U_EXPORT2 unum_parseInt64(const UNumberFormat* fmt, const UChar* text, int32_t textLength, int32_t *parsePos /* 0 = start */, UErrorCode *status); /** * Parse a string into a double using a UNumberFormat. * The string will be parsed according to the UNumberFormat's locale. * Note: parsing is not supported for styles UNUM_DECIMAL_COMPACT_SHORT * and UNUM_DECIMAL_COMPACT_LONG. * @param fmt The formatter to use. * @param text The text to parse. * @param textLength The length of text, or -1 if null-terminated. * @param parsePos If not NULL, on input a pointer to an integer specifying the offset at which * to begin parsing.
If not NULL, on output the offset at which parsing ended. * @param status A pointer to an UErrorCode to receive any errors * @return The value of the parsed double * @see unum_parse * @see unum_parseInt64 * @see unum_format * @see unum_formatInt64 * @see unum_formatDouble * @stable ICU 2.0 */ U_CAPI double U_EXPORT2 unum_parseDouble( const UNumberFormat* fmt, const UChar* text, int32_t textLength, int32_t *parsePos /* 0 = start */, UErrorCode *status); /** * Parse a number from a string into an unformatted numeric string using a UNumberFormat. * The input string will be parsed according to the UNumberFormat's locale. * The syntax of the output is a "numeric string" * as defined in the Decimal Arithmetic Specification, available at * http://speleotrove.com/decimal * Note: parsing is not supported for styles UNUM_DECIMAL_COMPACT_SHORT * and UNUM_DECIMAL_COMPACT_LONG. * @param fmt The formatter to use. * @param text The text to parse. * @param textLength The length of text, or -1 if null-terminated. * @param parsePos If not NULL, on input a pointer to an integer specifying the offset at which * to begin parsing. If not NULL, on output the offset at which parsing ended. * @param outBuf A (char *) buffer to receive the parsed number as a string. The output string * will be nul-terminated if there is sufficient space. * @param outBufLength The size of the output buffer. May be zero, in which case * the outBuf pointer may be NULL, and the function will return the * size of the output string. * @param status A pointer to an UErrorCode to receive any errors * @return the length of the output string, not including any terminating nul.
* @see unum_parse * @see unum_parseInt64 * @see unum_format * @see unum_formatInt64 * @see unum_formatDouble * @stable ICU 4.4 */ U_CAPI int32_t U_EXPORT2 unum_parseDecimal(const UNumberFormat* fmt, const UChar* text, int32_t textLength, int32_t *parsePos /* 0 = start */, char *outBuf, int32_t outBufLength, UErrorCode *status); /** * Parse a string into a double and a currency using a UNumberFormat. * The string will be parsed according to the UNumberFormat's locale. * @param fmt the formatter to use * @param text the text to parse * @param textLength the length of text, or -1 if null-terminated * @param parsePos a pointer to an offset index into text at which to * begin parsing. On output, *parsePos will point after the last * parsed character. This parameter may be NULL, in which case parsing * begins at offset 0. * @param currency a pointer to the buffer to receive the parsed null- * terminated currency. This buffer must have a capacity of at least * 4 UChars. * @param status a pointer to an input-output UErrorCode * @return the parsed double * @see unum_parseDouble * @see unum_formatDoubleCurrency * @stable ICU 3.0 */ U_CAPI double U_EXPORT2 unum_parseDoubleCurrency(const UNumberFormat* fmt, const UChar* text, int32_t textLength, int32_t* parsePos, /* 0 = start */ UChar* currency, UErrorCode* status); /** * Parse a UChar string into a UFormattable. * Example code: * \snippet test/cintltst/cnumtst.c unum_parseToUFormattable * Note: parsing is not supported for styles UNUM_DECIMAL_COMPACT_SHORT * and UNUM_DECIMAL_COMPACT_LONG. * @param fmt the formatter to use * @param result the UFormattable to hold the result. If NULL, a new UFormattable will be allocated (which the caller must close with ufmt_close). * @param text the text to parse * @param textLength the length of text, or -1 if null-terminated * @param parsePos a pointer to an offset index into text at which to * begin parsing. On output, *parsePos will point after the last * parsed character.
This parameter may be NULL, in which case parsing * begins at offset 0. * @param status a pointer to an input-output UErrorCode * @return the UFormattable. Will be ==result unless NULL was passed in for result, in which case it will be the newly opened UFormattable. * @see ufmt_getType * @see ufmt_close * @stable ICU 52 */ U_CAPI UFormattable* U_EXPORT2 unum_parseToUFormattable(const UNumberFormat* fmt, UFormattable *result, const UChar* text, int32_t textLength, int32_t* parsePos, /* 0 = start */ UErrorCode* status); /** * Set the pattern used by a UNumberFormat. This can only be used * on a DecimalFormat; other formats return U_UNSUPPORTED_ERROR * in the status. * @param format The formatter to set. * @param localized true if the pattern is localized, false otherwise. * @param pattern The new pattern * @param patternLength The length of pattern, or -1 if null-terminated. * @param parseError A pointer to UParseError to receive information * about errors that occurred during parsing, or NULL if no parse error * information is desired. * @param status A pointer to an input-output UErrorCode. * @see unum_toPattern * @see DecimalFormat * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 unum_applyPattern( UNumberFormat *format, UBool localized, const UChar *pattern, int32_t patternLength, UParseError *parseError, UErrorCode *status ); /** * Get a locale for which decimal formatting patterns are available. * A UNumberFormat in a locale returned by this function will perform the correct * formatting and parsing for the locale. The results of this call are not * valid for rule-based number formats. * @param localeIndex The index of the desired locale. * @return A locale for which number formatting patterns are available, or NULL if none. * @see unum_countAvailable * @stable ICU 2.0 */ U_CAPI const char* U_EXPORT2 unum_getAvailable(int32_t localeIndex); /** * Determine how many locales have decimal formatting patterns available.
The * results of this call are not valid for rule-based number formats. * This function is useful for determining the loop ending condition for * calls to {@link #unum_getAvailable }. * @return The number of locales for which decimal formatting patterns are available. * @see unum_getAvailable * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 unum_countAvailable(void); #if UCONFIG_HAVE_PARSEALLINPUT /* The UNumberFormatAttributeValue type cannot be #ifndef U_HIDE_INTERNAL_API, needed for .h variable declaration */ /** * @internal */ typedef enum UNumberFormatAttributeValue { /** @internal */ UNUM_FORMAT_ATTRIBUTE_VALUE_HIDDEN } UNumberFormatAttributeValue; #endif /** The possible UNumberFormat numeric attributes @stable ICU 2.0 */ typedef enum UNumberFormatAttribute { /** Parse integers only */ UNUM_PARSE_INT_ONLY, /** Use grouping separator */ UNUM_GROUPING_USED, /** Always show decimal point */ UNUM_DECIMAL_ALWAYS_SHOWN, /** Maximum integer digits */ UNUM_MAX_INTEGER_DIGITS, /** Minimum integer digits */ UNUM_MIN_INTEGER_DIGITS, /** Integer digits */ UNUM_INTEGER_DIGITS, /** Maximum fraction digits */ UNUM_MAX_FRACTION_DIGITS, /** Minimum fraction digits */ UNUM_MIN_FRACTION_DIGITS, /** Fraction digits */ UNUM_FRACTION_DIGITS, /** Multiplier */ UNUM_MULTIPLIER, /** Grouping size */ UNUM_GROUPING_SIZE, /** Rounding Mode */ UNUM_ROUNDING_MODE, /** Rounding increment */ UNUM_ROUNDING_INCREMENT, /** The width to which the output of format() is padded. */ UNUM_FORMAT_WIDTH, /** The position at which padding will take place. */ UNUM_PADDING_POSITION, /** Secondary grouping size */ UNUM_SECONDARY_GROUPING_SIZE, /** Use significant digits * @stable ICU 3.0 */ UNUM_SIGNIFICANT_DIGITS_USED, /** Minimum significant digits * @stable ICU 3.0 */ UNUM_MIN_SIGNIFICANT_DIGITS, /** Maximum significant digits * @stable ICU 3.0 */ UNUM_MAX_SIGNIFICANT_DIGITS, /** Lenient parse mode used by rule-based formats. 
* @stable ICU 3.0 */ UNUM_LENIENT_PARSE, #if UCONFIG_HAVE_PARSEALLINPUT /** Consume all input. (may use fastpath). Set to UNUM_YES (require fastpath), UNUM_NO (skip fastpath), or UNUM_MAYBE (heuristic). * This is an internal ICU API. Do not use. * @internal */ UNUM_PARSE_ALL_INPUT = 20, #endif /** * Scale, which adjusts the position of the * decimal point when formatting. Amounts will be multiplied by 10 ^ (scale) * before they are formatted. The default value for the scale is 0 ( no adjustment ). * *

Example: setting the scale to 3, 123 formats as "123,000" *

Example: setting the scale to -4, 123 formats as "0.0123" * * This setting is analogous to getMultiplierScale() and setMultiplierScale() in decimfmt.h. * * @stable ICU 51 */ UNUM_SCALE = 21, #if (NTDDI_VERSION >= NTDDI_WIN10_CO) /** * Minimum grouping digits; most commonly set to 2 to print "1000" instead of "1,000". * See DecimalFormat::getMinimumGroupingDigits(). * * For better control over grouping strategies, use UNumberFormatter. * * @stable ICU 64 */ UNUM_MINIMUM_GROUPING_DIGITS = 22, #endif // (NTDDI_VERSION >= NTDDI_WIN10_CO) /** * if this attribute is set to 0, it is set to UNUM_CURRENCY_STANDARD purpose, * otherwise it is UNUM_CASH_CURRENCY purpose * Default: 0 (UNUM_CURRENCY_STANDARD purpose) * @stable ICU 54 */ UNUM_CURRENCY_USAGE = 23, /** If 1, specifies that if setting the "max integer digits" attribute would truncate a value, set an error status rather than silently truncating. * For example, formatting the value 1234 with 4 max int digits would succeed, but formatting 12345 would fail. There is no effect on parsing. * Default: 0 (not set) * @stable ICU 50 */ UNUM_FORMAT_FAIL_IF_MORE_THAN_MAX_DIGITS = 0x1000, /** * if this attribute is set to 1, specifies that, if the pattern doesn't contain an exponent, the exponent will not be parsed. If the pattern does contain an exponent, this attribute has no effect. * Has no effect on formatting. * Default: 0 (unset) * @stable ICU 50 */ UNUM_PARSE_NO_EXPONENT = 0x1001, /** * if this attribute is set to 1, specifies that, if the pattern contains a * decimal mark the input is required to have one. If this attribute is set to 0, * specifies that input does not have to contain a decimal mark. * Has no effect on formatting. * Default: 0 (unset) * @stable ICU 54 */ UNUM_PARSE_DECIMAL_MARK_REQUIRED = 0x1002, #if (NTDDI_VERSION >= NTDDI_WIN10_CO) /** * Parsing: if set to 1, parsing is sensitive to case (lowercase/uppercase). 
* * @stable ICU 64 */ UNUM_PARSE_CASE_SENSITIVE = 0x1003, /** * Formatting: if set to 1, whether to show the plus sign on non-negative numbers. * * For better control over sign display, use UNumberFormatter. * * @stable ICU 64 */ UNUM_SIGN_ALWAYS_SHOWN = 0x1004, #endif // (NTDDI_VERSION >= NTDDI_WIN10_CO) } UNumberFormatAttribute; /** * Get a numeric attribute associated with a UNumberFormat. * An example of a numeric attribute is the number of integer digits a formatter will produce. * @param fmt The formatter to query. * @param attr The attribute to query; one of UNUM_PARSE_INT_ONLY, UNUM_GROUPING_USED, * UNUM_DECIMAL_ALWAYS_SHOWN, UNUM_MAX_INTEGER_DIGITS, UNUM_MIN_INTEGER_DIGITS, UNUM_INTEGER_DIGITS, * UNUM_MAX_FRACTION_DIGITS, UNUM_MIN_FRACTION_DIGITS, UNUM_FRACTION_DIGITS, UNUM_MULTIPLIER, * UNUM_GROUPING_SIZE, UNUM_ROUNDING_MODE, UNUM_FORMAT_WIDTH, UNUM_PADDING_POSITION, UNUM_SECONDARY_GROUPING_SIZE, * UNUM_SCALE, UNUM_MINIMUM_GROUPING_DIGITS. * @return The value of attr, or -1 if the formatter doesn't have the requested attribute. The caller should use unum_hasAttribute() to tell if the attribute * is available, rather than relaying on this function returning -1. * @see unum_hasAttribute * @see unum_setAttribute * @see unum_getDoubleAttribute * @see unum_setDoubleAttribute * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 unum_getAttribute(const UNumberFormat* fmt, UNumberFormatAttribute attr); /** * Set a numeric attribute associated with a UNumberFormat. * An example of a numeric attribute is the number of integer digits a formatter will produce. If the * formatter does not understand the attribute, the call is ignored. Rule-based formatters only understand * the lenient-parse attribute. The caller can use unum_hasAttribute() to find out if the formatter supports the attribute. * @param fmt The formatter to set. 
* @param attr The attribute to set; one of UNUM_PARSE_INT_ONLY, UNUM_GROUPING_USED, * UNUM_DECIMAL_ALWAYS_SHOWN, UNUM_MAX_INTEGER_DIGITS, UNUM_MIN_INTEGER_DIGITS, UNUM_INTEGER_DIGITS, * UNUM_MAX_FRACTION_DIGITS, UNUM_MIN_FRACTION_DIGITS, UNUM_FRACTION_DIGITS, UNUM_MULTIPLIER, * UNUM_GROUPING_SIZE, UNUM_ROUNDING_MODE, UNUM_FORMAT_WIDTH, UNUM_PADDING_POSITION, UNUM_SECONDARY_GROUPING_SIZE, * UNUM_LENIENT_PARSE, UNUM_SCALE, UNUM_MINIMUM_GROUPING_DIGITS. * @param newValue The new value of attr. * @see unum_hasAttribute * @see unum_getAttribute * @see unum_getDoubleAttribute * @see unum_setDoubleAttribute * @see unum_getTextAttribute * @see unum_setTextAttribute * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 unum_setAttribute( UNumberFormat* fmt, UNumberFormatAttribute attr, int32_t newValue); /** * Get a numeric attribute associated with a UNumberFormat. * An example of a numeric attribute is the number of integer digits a formatter will produce. * If the formatter does not understand the attribute, -1 is returned. The caller should use unum_hasAttribute() * to determine if the attribute is supported, rather than relying on this function returning -1. * @param fmt The formatter to query. * @param attr The attribute to query; e.g. UNUM_ROUNDING_INCREMENT. * @return The value of attr, or -1 if the formatter doesn't understand the attribute. * @see unum_hasAttribute * @see unum_getAttribute * @see unum_setAttribute * @see unum_setDoubleAttribute * @see unum_getTextAttribute * @see unum_setTextAttribute * @stable ICU 2.0 */ U_CAPI double U_EXPORT2 unum_getDoubleAttribute(const UNumberFormat* fmt, UNumberFormatAttribute attr); /** * Set a numeric attribute associated with a UNumberFormat. * An example of a numeric attribute is the number of integer digits a formatter will produce. * If the formatter does not understand the attribute, this call is ignored. The caller can use * unum_hasAttribute() to tell in advance whether the formatter understands the attribute. 
* @param fmt The formatter to set. * @param attr The attribute to set; e.g. UNUM_ROUNDING_INCREMENT. * @param newValue The new value of attr. * @see unum_hasAttribute * @see unum_getAttribute * @see unum_setAttribute * @see unum_getDoubleAttribute * @see unum_getTextAttribute * @see unum_setTextAttribute * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 unum_setDoubleAttribute( UNumberFormat* fmt, UNumberFormatAttribute attr, double newValue); /** The possible UNumberFormat text attributes @stable ICU 2.0*/ typedef enum UNumberFormatTextAttribute { /** Positive prefix */ UNUM_POSITIVE_PREFIX, /** Positive suffix */ UNUM_POSITIVE_SUFFIX, /** Negative prefix */ UNUM_NEGATIVE_PREFIX, /** Negative suffix */ UNUM_NEGATIVE_SUFFIX, /** The character used to pad to the format width. */ UNUM_PADDING_CHARACTER, /** The ISO currency code */ UNUM_CURRENCY_CODE, /** * The default rule set, such as "%spellout-numbering-year:", "%spellout-cardinal:", * "%spellout-ordinal-masculine-plural:", "%spellout-ordinal-feminine:", or * "%spellout-ordinal-neuter:". The available public rulesets can be listed using * unum_getTextAttribute with UNUM_PUBLIC_RULESETS. This is only available with * rule-based formatters. * @stable ICU 3.0 */ UNUM_DEFAULT_RULESET, /** * The public rule sets. This is only available with rule-based formatters. * This is a read-only attribute. The public rulesets are returned as a * single string, with each ruleset name delimited by ';' (semicolon). See the * CLDR LDML spec for more information about RBNF rulesets: * http://www.unicode.org/reports/tr35/tr35-numbers.html#Rule-Based_Number_Formatting * @stable ICU 3.0 */ UNUM_PUBLIC_RULESETS } UNumberFormatTextAttribute; /** * Get a text attribute associated with a UNumberFormat. * An example of a text attribute is the suffix for positive numbers. If the formatter * does not understand the attribute, U_UNSUPPORTED_ERROR is returned as the status. 
* Rule-based formatters only understand UNUM_DEFAULT_RULESET and UNUM_PUBLIC_RULESETS. * @param fmt The formatter to query. * @param tag The attribute to query; one of UNUM_POSITIVE_PREFIX, UNUM_POSITIVE_SUFFIX, * UNUM_NEGATIVE_PREFIX, UNUM_NEGATIVE_SUFFIX, UNUM_PADDING_CHARACTER, UNUM_CURRENCY_CODE, * UNUM_DEFAULT_RULESET, or UNUM_PUBLIC_RULESETS. * @param result A pointer to a buffer to receive the attribute. * @param resultLength The maximum size of result. * @param status A pointer to an UErrorCode to receive any errors * @return The total buffer size needed; if greater than resultLength, the output was truncated. * @see unum_setTextAttribute * @see unum_getAttribute * @see unum_setAttribute * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 unum_getTextAttribute( const UNumberFormat* fmt, UNumberFormatTextAttribute tag, UChar* result, int32_t resultLength, UErrorCode* status); /** * Set a text attribute associated with a UNumberFormat. * An example of a text attribute is the suffix for positive numbers. Rule-based formatters * only understand UNUM_DEFAULT_RULESET. * @param fmt The formatter to set. * @param tag The attribute to set; one of UNUM_POSITIVE_PREFIX, UNUM_POSITIVE_SUFFIX, * UNUM_NEGATIVE_PREFIX, UNUM_NEGATIVE_SUFFIX, UNUM_PADDING_CHARACTER, UNUM_CURRENCY_CODE, * or UNUM_DEFAULT_RULESET. * @param newValue The new value of attr. * @param newValueLength The length of newValue, or -1 if null-terminated. * @param status A pointer to an UErrorCode to receive any errors * @see unum_getTextAttribute * @see unum_getAttribute * @see unum_setAttribute * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 unum_setTextAttribute( UNumberFormat* fmt, UNumberFormatTextAttribute tag, const UChar* newValue, int32_t newValueLength, UErrorCode *status); /** * Extract the pattern from a UNumberFormat. The pattern will follow * the DecimalFormat pattern syntax. * @param fmt The formatter to query. * @param isPatternLocalized true if the pattern should be localized, * false otherwise. 
This is ignored if the formatter is a rule-based * formatter. * @param result A pointer to a buffer to receive the pattern. * @param resultLength The maximum size of result. * @param status A pointer to an input-output UErrorCode. * @return The total buffer size needed; if greater than resultLength, * the output was truncated. * @see unum_applyPattern * @see DecimalFormat * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 unum_toPattern( const UNumberFormat* fmt, UBool isPatternLocalized, UChar* result, int32_t resultLength, UErrorCode* status); /** * Constants for specifying a number format symbol. * @stable ICU 2.0 */ typedef enum UNumberFormatSymbol { /** The decimal separator */ UNUM_DECIMAL_SEPARATOR_SYMBOL = 0, /** The grouping separator */ UNUM_GROUPING_SEPARATOR_SYMBOL = 1, /** The pattern separator */ UNUM_PATTERN_SEPARATOR_SYMBOL = 2, /** The percent sign */ UNUM_PERCENT_SYMBOL = 3, /** Zero*/ UNUM_ZERO_DIGIT_SYMBOL = 4, /** Character representing a digit in the pattern */ UNUM_DIGIT_SYMBOL = 5, /** The minus sign */ UNUM_MINUS_SIGN_SYMBOL = 6, /** The plus sign */ UNUM_PLUS_SIGN_SYMBOL = 7, /** The currency symbol */ UNUM_CURRENCY_SYMBOL = 8, /** The international currency symbol */ UNUM_INTL_CURRENCY_SYMBOL = 9, /** The monetary separator */ UNUM_MONETARY_SEPARATOR_SYMBOL = 10, /** The exponential symbol */ UNUM_EXPONENTIAL_SYMBOL = 11, /** Per mill symbol */ UNUM_PERMILL_SYMBOL = 12, /** Escape padding character */ UNUM_PAD_ESCAPE_SYMBOL = 13, /** Infinity symbol */ UNUM_INFINITY_SYMBOL = 14, /** Nan symbol */ UNUM_NAN_SYMBOL = 15, /** Significant digit symbol * @stable ICU 3.0 */ UNUM_SIGNIFICANT_DIGIT_SYMBOL = 16, /** The monetary grouping separator * @stable ICU 3.6 */ UNUM_MONETARY_GROUPING_SEPARATOR_SYMBOL = 17, /** One * @stable ICU 4.6 */ UNUM_ONE_DIGIT_SYMBOL = 18, /** Two * @stable ICU 4.6 */ UNUM_TWO_DIGIT_SYMBOL = 19, /** Three * @stable ICU 4.6 */ UNUM_THREE_DIGIT_SYMBOL = 20, /** Four * @stable ICU 4.6 */ UNUM_FOUR_DIGIT_SYMBOL = 21, /** Five * 
@stable ICU 4.6 */ UNUM_FIVE_DIGIT_SYMBOL = 22, /** Six * @stable ICU 4.6 */ UNUM_SIX_DIGIT_SYMBOL = 23, /** Seven * @stable ICU 4.6 */ UNUM_SEVEN_DIGIT_SYMBOL = 24, /** Eight * @stable ICU 4.6 */ UNUM_EIGHT_DIGIT_SYMBOL = 25, /** Nine * @stable ICU 4.6 */ UNUM_NINE_DIGIT_SYMBOL = 26, /** Multiplication sign * @stable ICU 54 */ UNUM_EXPONENT_MULTIPLICATION_SYMBOL = 27, } UNumberFormatSymbol; /** * Get a symbol associated with a UNumberFormat. * A UNumberFormat uses symbols to represent the special locale-dependent * characters in a number, for example the percent sign. This API is not * supported for rule-based formatters. * @param fmt The formatter to query. * @param symbol The UNumberFormatSymbol constant for the symbol to get * @param buffer The string buffer that will receive the symbol string; * if it is NULL, then only the length of the symbol is returned * @param size The size of the string buffer * @param status A pointer to an UErrorCode to receive any errors * @return The length of the symbol; the buffer is not modified if * length>=size * @see unum_setSymbol * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 unum_getSymbol(const UNumberFormat *fmt, UNumberFormatSymbol symbol, UChar *buffer, int32_t size, UErrorCode *status); /** * Set a symbol associated with a UNumberFormat. * A UNumberFormat uses symbols to represent the special locale-dependent * characters in a number, for example the percent sign. This API is not * supported for rule-based formatters. * @param fmt The formatter to set. * @param symbol The UNumberFormatSymbol constant for the symbol to set * @param value The string to set the symbol to * @param length The length of the string, or -1 for a zero-terminated string * @param status A pointer to an UErrorCode to receive any errors. 
* @see unum_getSymbol * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 unum_setSymbol(UNumberFormat *fmt, UNumberFormatSymbol symbol, const UChar *value, int32_t length, UErrorCode *status); /** * Get the locale for this number format object. * You can choose between valid and actual locale. * @param fmt The formatter to get the locale from * @param type type of the locale we're looking for (valid or actual) * @param status error code for the operation * @return the locale name * @stable ICU 2.8 */ U_CAPI const char* U_EXPORT2 unum_getLocaleByType(const UNumberFormat *fmt, ULocDataLocaleType type, UErrorCode* status); /** * Set a particular UDisplayContext value in the formatter, such as * UDISPCTX_CAPITALIZATION_FOR_STANDALONE. * @param fmt The formatter for which to set a UDisplayContext value. * @param value The UDisplayContext value to set. * @param status A pointer to an UErrorCode to receive any errors * @stable ICU 53 */ U_CAPI void U_EXPORT2 unum_setContext(UNumberFormat* fmt, UDisplayContext value, UErrorCode* status); /** * Get the formatter's UDisplayContext value for the specified UDisplayContextType, * such as UDISPCTX_TYPE_CAPITALIZATION. * @param fmt The formatter to query. * @param type The UDisplayContextType whose value to return * @param status A pointer to an UErrorCode to receive any errors * @return The UDisplayContextValue for the specified type. * @stable ICU 53 */ U_CAPI UDisplayContext U_EXPORT2 unum_getContext(const UNumberFormat *fmt, UDisplayContextType type, UErrorCode* status); #endif /* #if !UCONFIG_NO_FORMATTING */ #endif // udat.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 1996-2016, International Business Machines * Corporation and others. All Rights Reserved. 
******************************************************************************* */ #ifndef UDAT_H #define UDAT_H #if !UCONFIG_NO_FORMATTING /** * \file * \brief C API: DateFormat * *

Date Format C API

* * Date Format C API consists of functions that convert dates and * times from their internal representations to textual form and back again in a * language-independent manner. Converting from the internal representation (milliseconds * since midnight, January 1, 1970) to text is known as "formatting," and converting * from text to millis is known as "parsing." We currently define only one concrete * structure UDateFormat, which can handle pretty much all normal * date formatting and parsing actions. *

* Date Format helps you to format and parse dates for any locale. Your code can * be completely independent of the locale conventions for months, days of the * week, or even the calendar format: lunar vs. solar. *

* To format a date for the current Locale with default time and date style, * use one of the static factory methods: *

 * \code
 *  UErrorCode status = U_ZERO_ERROR;
 *  UChar *myString;
 *  int32_t myStrlen = 0;
 *  UDateFormat* dfmt = udat_open(UDAT_DEFAULT, UDAT_DEFAULT, NULL, NULL, -1, NULL, -1, &status);
 *  myStrlen = udat_format(dfmt, myDate, NULL, myStrlen, NULL, &status);
 *  if (status==U_BUFFER_OVERFLOW_ERROR){
 *      status=U_ZERO_ERROR;
 *      myString=(UChar*)malloc(sizeof(UChar) * (myStrlen+1) );
 *      udat_format(dfmt, myDate, myString, myStrlen+1, NULL, &status);
 *  }
 * \endcode
 * 
If you are formatting multiple dates, it is more efficient to get the * format and use it multiple times so that the system doesn't have to fetch the * information about the local language and country conventions multiple times. *
 * \code
 *  UErrorCode status = U_ZERO_ERROR;
 *  int32_t i, myStrlen = 0;
 *  UChar* myString;
 *  char buffer[1024];
 *  UDate myDateArr[] = { 0.0, 100000000.0, 2000000000.0 }; // test values
 *  UDateFormat* df = udat_open(UDAT_DEFAULT, UDAT_DEFAULT, NULL, NULL, -1, NULL, 0, &status);
 *  for (i = 0; i < 3; i++) {
 *      myStrlen = udat_format(df, myDateArr[i], NULL, myStrlen, NULL, &status);
 *      if(status == U_BUFFER_OVERFLOW_ERROR){
 *          status = U_ZERO_ERROR;
 *          myString = (UChar*)malloc(sizeof(UChar) * (myStrlen+1) );
 *          udat_format(df, myDateArr[i], myString, myStrlen+1, NULL, &status);
 *          printf("%s\n", u_austrcpy(buffer, myString) );
 *          free(myString);
 *      }
 *  }
 * \endcode
 * 
* To get specific fields of a date, you can use UFieldPosition to * get specific fields. *
 * \code
 *  UErrorCode status = U_ZERO_ERROR;
 *  UFieldPosition pos;
 *  UChar *myString;
 *  int32_t myStrlen = 0;
 *  char buffer[1024];
 *
 *  pos.field = 1;  // Same as the DateFormat::EField enum
 *  UDateFormat* dfmt = udat_open(UDAT_DEFAULT, UDAT_DEFAULT, NULL, NULL, -1, NULL, 0, &status);
 *  myStrlen = udat_format(dfmt, myDate, NULL, myStrlen, &pos, &status);
 *  if (status==U_BUFFER_OVERFLOW_ERROR){
 *      status=U_ZERO_ERROR;
 *      myString=(UChar*)malloc(sizeof(UChar) * (myStrlen+1) );
 *      udat_format(dfmt, myDate, myString, myStrlen+1, &pos, &status);
 *  }
 *  printf("date format: %s\n", u_austrcpy(buffer, myString));
 *  buffer[pos.endIndex] = 0;   // NUL-terminate the string.
 *  printf("UFieldPosition position equals %s\n", &buffer[pos.beginIndex]);
 * \endcode
 * 
* To format a date for a different Locale, specify it in the call to * udat_open() *
 * \code
 *        UDateFormat* df = udat_open(UDAT_SHORT, UDAT_SHORT, "fr_FR", NULL, -1, NULL, 0, &status);
 * \endcode
 * 
* You can use a DateFormat API udat_parse() to parse. *
 * \code
 *  UErrorCode status = U_ZERO_ERROR;
 *  int32_t parsepos=0;
 *  UDate myDate = udat_parse(df, myString, u_strlen(myString), &parsepos, &status);
 * \endcode
 * 
* You can pass in different options for the arguments for date and time style * to control the length of the result; from SHORT to MEDIUM to LONG to FULL. * The exact result depends on the locale, but generally: * see UDateFormatStyle for more details *
    *
  • UDAT_SHORT is completely numeric, such as 12/13/52 or 3:30pm *
  • UDAT_MEDIUM is longer, such as Jan 12, 1952 *
  • UDAT_LONG is longer, such as January 12, 1952 or 3:30:32pm *
  • UDAT_FULL is pretty completely specified, such as * Tuesday, April 12, 1952 AD or 3:30:42pm PST. *
* You can also set the time zone on the format if you wish. *

* You can also use forms of the parse and format methods with Parse Position and * UFieldPosition to allow you to *

    *
  • Progressively parse through pieces of a string. *
  • Align any particular field, or find out where it is for selection * on the screen. *
*

Date and Time Patterns:

* *

Date and time formats are specified by date and time pattern strings. * Within date and time pattern strings, all unquoted ASCII letters [A-Za-z] are reserved * as pattern letters representing calendar fields. UDateFormat supports * the date and time formatting algorithm and pattern letters defined by * UTS#35 * Unicode Locale Data Markup Language (LDML) and further documented for ICU in the * ICU * User Guide.

*/ /** A date formatter. * For usage in C programs. * @stable ICU 2.6 */ typedef void* UDateFormat; /** The possible date/time format styles * @stable ICU 2.6 */ typedef enum UDateFormatStyle { /** Full style */ UDAT_FULL, /** Long style */ UDAT_LONG, /** Medium style */ UDAT_MEDIUM, /** Short style */ UDAT_SHORT, /** Default style */ UDAT_DEFAULT = UDAT_MEDIUM, /** Bitfield for relative date */ UDAT_RELATIVE = (1 << 7), UDAT_FULL_RELATIVE = UDAT_FULL | UDAT_RELATIVE, UDAT_LONG_RELATIVE = UDAT_LONG | UDAT_RELATIVE, UDAT_MEDIUM_RELATIVE = UDAT_MEDIUM | UDAT_RELATIVE, UDAT_SHORT_RELATIVE = UDAT_SHORT | UDAT_RELATIVE, /** No style */ UDAT_NONE = -1, /** * Use the pattern given in the parameter to udat_open * @see udat_open * @stable ICU 50 */ UDAT_PATTERN = -2, } UDateFormatStyle; /* Skeletons for dates. */ /** * Constant for date skeleton with year. * @stable ICU 4.0 */ #define UDAT_YEAR "y" /** * Constant for date skeleton with quarter. * @stable ICU 51 */ #define UDAT_QUARTER "QQQQ" /** * Constant for date skeleton with abbreviated quarter. * @stable ICU 51 */ #define UDAT_ABBR_QUARTER "QQQ" /** * Constant for date skeleton with year and quarter. * @stable ICU 4.0 */ #define UDAT_YEAR_QUARTER "yQQQQ" /** * Constant for date skeleton with year and abbreviated quarter. * @stable ICU 4.0 */ #define UDAT_YEAR_ABBR_QUARTER "yQQQ" /** * Constant for date skeleton with month. * @stable ICU 4.0 */ #define UDAT_MONTH "MMMM" /** * Constant for date skeleton with abbreviated month. * @stable ICU 4.0 */ #define UDAT_ABBR_MONTH "MMM" /** * Constant for date skeleton with numeric month. * @stable ICU 4.0 */ #define UDAT_NUM_MONTH "M" /** * Constant for date skeleton with year and month. * @stable ICU 4.0 */ #define UDAT_YEAR_MONTH "yMMMM" /** * Constant for date skeleton with year and abbreviated month. * @stable ICU 4.0 */ #define UDAT_YEAR_ABBR_MONTH "yMMM" /** * Constant for date skeleton with year and numeric month. 
* @stable ICU 4.0 */ #define UDAT_YEAR_NUM_MONTH "yM" /** * Constant for date skeleton with day. * @stable ICU 4.0 */ #define UDAT_DAY "d" /** * Constant for date skeleton with year, month, and day. * Used in combinations date + time, date + time + zone, or time + zone. * @stable ICU 4.0 */ #define UDAT_YEAR_MONTH_DAY "yMMMMd" /** * Constant for date skeleton with year, abbreviated month, and day. * Used in combinations date + time, date + time + zone, or time + zone. * @stable ICU 4.0 */ #define UDAT_YEAR_ABBR_MONTH_DAY "yMMMd" /** * Constant for date skeleton with year, numeric month, and day. * Used in combinations date + time, date + time + zone, or time + zone. * @stable ICU 4.0 */ #define UDAT_YEAR_NUM_MONTH_DAY "yMd" /** * Constant for date skeleton with weekday. * @stable ICU 51 */ #define UDAT_WEEKDAY "EEEE" /** * Constant for date skeleton with abbreviated weekday. * @stable ICU 51 */ #define UDAT_ABBR_WEEKDAY "E" /** * Constant for date skeleton with year, month, weekday, and day. * Used in combinations date + time, date + time + zone, or time + zone. * @stable ICU 4.0 */ #define UDAT_YEAR_MONTH_WEEKDAY_DAY "yMMMMEEEEd" /** * Constant for date skeleton with year, abbreviated month, weekday, and day. * Used in combinations date + time, date + time + zone, or time + zone. * @stable ICU 4.0 */ #define UDAT_YEAR_ABBR_MONTH_WEEKDAY_DAY "yMMMEd" /** * Constant for date skeleton with year, numeric month, weekday, and day. * Used in combinations date + time, date + time + zone, or time + zone. * @stable ICU 4.0 */ #define UDAT_YEAR_NUM_MONTH_WEEKDAY_DAY "yMEd" /** * Constant for date skeleton with long month and day. * Used in combinations date + time, date + time + zone, or time + zone. * @stable ICU 4.0 */ #define UDAT_MONTH_DAY "MMMMd" /** * Constant for date skeleton with abbreviated month and day. * Used in combinations date + time, date + time + zone, or time + zone. 
* @stable ICU 4.0 */ #define UDAT_ABBR_MONTH_DAY "MMMd" /** * Constant for date skeleton with numeric month and day. * Used in combinations date + time, date + time + zone, or time + zone. * @stable ICU 4.0 */ #define UDAT_NUM_MONTH_DAY "Md" /** * Constant for date skeleton with month, weekday, and day. * Used in combinations date + time, date + time + zone, or time + zone. * @stable ICU 4.0 */ #define UDAT_MONTH_WEEKDAY_DAY "MMMMEEEEd" /** * Constant for date skeleton with abbreviated month, weekday, and day. * Used in combinations date + time, date + time + zone, or time + zone. * @stable ICU 4.0 */ #define UDAT_ABBR_MONTH_WEEKDAY_DAY "MMMEd" /** * Constant for date skeleton with numeric month, weekday, and day. * Used in combinations date + time, date + time + zone, or time + zone. * @stable ICU 4.0 */ #define UDAT_NUM_MONTH_WEEKDAY_DAY "MEd" /* Skeletons for times. */ /** * Constant for date skeleton with hour, with the locale's preferred hour format (12 or 24). * @stable ICU 4.0 */ #define UDAT_HOUR "j" /** * Constant for date skeleton with hour in 24-hour presentation. * @stable ICU 51 */ #define UDAT_HOUR24 "H" /** * Constant for date skeleton with minute. * @stable ICU 51 */ #define UDAT_MINUTE "m" /** * Constant for date skeleton with hour and minute, with the locale's preferred hour format (12 or 24). * Used in combinations date + time, date + time + zone, or time + zone. * @stable ICU 4.0 */ #define UDAT_HOUR_MINUTE "jm" /** * Constant for date skeleton with hour and minute in 24-hour presentation. * Used in combinations date + time, date + time + zone, or time + zone. * @stable ICU 4.0 */ #define UDAT_HOUR24_MINUTE "Hm" /** * Constant for date skeleton with second. * @stable ICU 51 */ #define UDAT_SECOND "s" /** * Constant for date skeleton with hour, minute, and second, * with the locale's preferred hour format (12 or 24). * Used in combinations date + time, date + time + zone, or time + zone. 
* @stable ICU 4.0 */ #define UDAT_HOUR_MINUTE_SECOND "jms" /** * Constant for date skeleton with hour, minute, and second in * 24-hour presentation. * Used in combinations date + time, date + time + zone, or time + zone. * @stable ICU 4.0 */ #define UDAT_HOUR24_MINUTE_SECOND "Hms" /** * Constant for date skeleton with minute and second. * Used in combinations date + time, date + time + zone, or time + zone. * @stable ICU 4.0 */ #define UDAT_MINUTE_SECOND "ms" /* Skeletons for time zones. */ /** * Constant for generic location format, such as Los Angeles Time; * used in combinations date + time + zone, or time + zone. * @see LDML Date Format Patterns * @see LDML Time Zone Fallback * @stable ICU 51 */ #define UDAT_LOCATION_TZ "VVVV" /** * Constant for generic non-location format, such as Pacific Time; * used in combinations date + time + zone, or time + zone. * @see LDML Date Format Patterns * @see LDML Time Zone Fallback * @stable ICU 51 */ #define UDAT_GENERIC_TZ "vvvv" /** * Constant for generic non-location format, abbreviated if possible, such as PT; * used in combinations date + time + zone, or time + zone. * @see LDML Date Format Patterns * @see LDML Time Zone Fallback * @stable ICU 51 */ #define UDAT_ABBR_GENERIC_TZ "v" /** * Constant for specific non-location format, such as Pacific Daylight Time; * used in combinations date + time + zone, or time + zone. * @see LDML Date Format Patterns * @see LDML Time Zone Fallback * @stable ICU 51 */ #define UDAT_SPECIFIC_TZ "zzzz" /** * Constant for specific non-location format, abbreviated if possible, such as PDT; * used in combinations date + time + zone, or time + zone. * @see LDML Date Format Patterns * @see LDML Time Zone Fallback * @stable ICU 51 */ #define UDAT_ABBR_SPECIFIC_TZ "z" /** * Constant for localized GMT/UTC format, such as GMT+8:00 or HPG-8:00; * used in combinations date + time + zone, or time + zone. 
* @see LDML Date Format Patterns * @see LDML Time Zone Fallback * @stable ICU 51 */ #define UDAT_ABBR_UTC_TZ "ZZZZ" /* deprecated skeleton constants */ /** * FieldPosition and UFieldPosition selectors for format fields * defined by DateFormat and UDateFormat. * @stable ICU 3.0 */ typedef enum UDateFormatField { /** * FieldPosition and UFieldPosition selector for 'G' field alignment, * corresponding to the UCAL_ERA field. * @stable ICU 3.0 */ UDAT_ERA_FIELD = 0, /** * FieldPosition and UFieldPosition selector for 'y' field alignment, * corresponding to the UCAL_YEAR field. * @stable ICU 3.0 */ UDAT_YEAR_FIELD = 1, /** * FieldPosition and UFieldPosition selector for 'M' field alignment, * corresponding to the UCAL_MONTH field. * @stable ICU 3.0 */ UDAT_MONTH_FIELD = 2, /** * FieldPosition and UFieldPosition selector for 'd' field alignment, * corresponding to the UCAL_DATE field. * @stable ICU 3.0 */ UDAT_DATE_FIELD = 3, /** * FieldPosition and UFieldPosition selector for 'k' field alignment, * corresponding to the UCAL_HOUR_OF_DAY field. * UDAT_HOUR_OF_DAY1_FIELD is used for the one-based 24-hour clock. * For example, 23:59 + 01:00 results in 24:59. * @stable ICU 3.0 */ UDAT_HOUR_OF_DAY1_FIELD = 4, /** * FieldPosition and UFieldPosition selector for 'H' field alignment, * corresponding to the UCAL_HOUR_OF_DAY field. * UDAT_HOUR_OF_DAY0_FIELD is used for the zero-based 24-hour clock. * For example, 23:59 + 01:00 results in 00:59. * @stable ICU 3.0 */ UDAT_HOUR_OF_DAY0_FIELD = 5, /** * FieldPosition and UFieldPosition selector for 'm' field alignment, * corresponding to the UCAL_MINUTE field. * @stable ICU 3.0 */ UDAT_MINUTE_FIELD = 6, /** * FieldPosition and UFieldPosition selector for 's' field alignment, * corresponding to the UCAL_SECOND field. * @stable ICU 3.0 */ UDAT_SECOND_FIELD = 7, /** * FieldPosition and UFieldPosition selector for 'S' field alignment, * corresponding to the UCAL_MILLISECOND field. 
* * Note: Time formats that use 'S' can display a maximum of three * significant digits for fractional seconds, corresponding to millisecond * resolution and a fractional seconds sub-pattern of SSS. If the * sub-pattern is S or SS, the fractional seconds value will be truncated * (not rounded) to the number of display places specified. If the * fractional seconds sub-pattern is longer than SSS, the additional * display places will be filled with zeros. * @stable ICU 3.0 */ UDAT_FRACTIONAL_SECOND_FIELD = 8, /** * FieldPosition and UFieldPosition selector for 'E' field alignment, * corresponding to the UCAL_DAY_OF_WEEK field. * @stable ICU 3.0 */ UDAT_DAY_OF_WEEK_FIELD = 9, /** * FieldPosition and UFieldPosition selector for 'D' field alignment, * corresponding to the UCAL_DAY_OF_YEAR field. * @stable ICU 3.0 */ UDAT_DAY_OF_YEAR_FIELD = 10, /** * FieldPosition and UFieldPosition selector for 'F' field alignment, * corresponding to the UCAL_DAY_OF_WEEK_IN_MONTH field. * @stable ICU 3.0 */ UDAT_DAY_OF_WEEK_IN_MONTH_FIELD = 11, /** * FieldPosition and UFieldPosition selector for 'w' field alignment, * corresponding to the UCAL_WEEK_OF_YEAR field. * @stable ICU 3.0 */ UDAT_WEEK_OF_YEAR_FIELD = 12, /** * FieldPosition and UFieldPosition selector for 'W' field alignment, * corresponding to the UCAL_WEEK_OF_MONTH field. * @stable ICU 3.0 */ UDAT_WEEK_OF_MONTH_FIELD = 13, /** * FieldPosition and UFieldPosition selector for 'a' field alignment, * corresponding to the UCAL_AM_PM field. * @stable ICU 3.0 */ UDAT_AM_PM_FIELD = 14, /** * FieldPosition and UFieldPosition selector for 'h' field alignment, * corresponding to the UCAL_HOUR field. * UDAT_HOUR1_FIELD is used for the one-based 12-hour clock. * For example, 11:30 PM + 1 hour results in 12:30 AM. * @stable ICU 3.0 */ UDAT_HOUR1_FIELD = 15, /** * FieldPosition and UFieldPosition selector for 'K' field alignment, * corresponding to the UCAL_HOUR field. * UDAT_HOUR0_FIELD is used for the zero-based 12-hour clock. 
* For example, 11:30 PM + 1 hour results in 00:30 AM. * @stable ICU 3.0 */ UDAT_HOUR0_FIELD = 16, /** * FieldPosition and UFieldPosition selector for 'z' field alignment, * corresponding to the UCAL_ZONE_OFFSET and * UCAL_DST_OFFSET fields. * @stable ICU 3.0 */ UDAT_TIMEZONE_FIELD = 17, /** * FieldPosition and UFieldPosition selector for 'Y' field alignment, * corresponding to the UCAL_YEAR_WOY field. * @stable ICU 3.0 */ UDAT_YEAR_WOY_FIELD = 18, /** * FieldPosition and UFieldPosition selector for 'e' field alignment, * corresponding to the UCAL_DOW_LOCAL field. * @stable ICU 3.0 */ UDAT_DOW_LOCAL_FIELD = 19, /** * FieldPosition and UFieldPosition selector for 'u' field alignment, * corresponding to the UCAL_EXTENDED_YEAR field. * @stable ICU 3.0 */ UDAT_EXTENDED_YEAR_FIELD = 20, /** * FieldPosition and UFieldPosition selector for 'g' field alignment, * corresponding to the UCAL_JULIAN_DAY field. * @stable ICU 3.0 */ UDAT_JULIAN_DAY_FIELD = 21, /** * FieldPosition and UFieldPosition selector for 'A' field alignment, * corresponding to the UCAL_MILLISECONDS_IN_DAY field. * @stable ICU 3.0 */ UDAT_MILLISECONDS_IN_DAY_FIELD = 22, /** * FieldPosition and UFieldPosition selector for 'Z' field alignment, * corresponding to the UCAL_ZONE_OFFSET and * UCAL_DST_OFFSET fields. * @stable ICU 3.0 */ UDAT_TIMEZONE_RFC_FIELD = 23, /** * FieldPosition and UFieldPosition selector for 'v' field alignment, * corresponding to the UCAL_ZONE_OFFSET field. * @stable ICU 3.4 */ UDAT_TIMEZONE_GENERIC_FIELD = 24, /** * FieldPosition selector for 'c' field alignment, * corresponding to the {@link #UCAL_DOW_LOCAL} field. * This displays the stand alone day name, if available. * @stable ICU 3.4 */ UDAT_STANDALONE_DAY_FIELD = 25, /** * FieldPosition selector for 'L' field alignment, * corresponding to the {@link #UCAL_MONTH} field. * This displays the stand alone month name, if available. 
* @stable ICU 3.4 */ UDAT_STANDALONE_MONTH_FIELD = 26, /** * FieldPosition selector for "Q" field alignment, * corresponding to quarters. This is implemented * using the {@link #UCAL_MONTH} field. This * displays the quarter. * @stable ICU 3.6 */ UDAT_QUARTER_FIELD = 27, /** * FieldPosition selector for the "q" field alignment, * corresponding to stand-alone quarters. This is * implemented using the {@link #UCAL_MONTH} field. * This displays the stand-alone quarter. * @stable ICU 3.6 */ UDAT_STANDALONE_QUARTER_FIELD = 28, /** * FieldPosition and UFieldPosition selector for 'V' field alignment, * corresponding to the UCAL_ZONE_OFFSET field. * @stable ICU 3.8 */ UDAT_TIMEZONE_SPECIAL_FIELD = 29, /** * FieldPosition selector for "U" field alignment, * corresponding to cyclic year names. This is implemented * using the {@link #UCAL_YEAR} field. This displays * the cyclic year name, if available. * @stable ICU 49 */ UDAT_YEAR_NAME_FIELD = 30, /** * FieldPosition selector for 'O' field alignment, * corresponding to the UCAL_ZONE_OFFSET and UCAL_DST_OFFSET fields. * This displays the localized GMT format. * @stable ICU 51 */ UDAT_TIMEZONE_LOCALIZED_GMT_OFFSET_FIELD = 31, /** * FieldPosition selector for 'X' field alignment, * corresponding to the UCAL_ZONE_OFFSET and UCAL_DST_OFFSET fields. * This displays the ISO 8601 local time offset format or UTC indicator ("Z"). * @stable ICU 51 */ UDAT_TIMEZONE_ISO_FIELD = 32, /** * FieldPosition selector for 'x' field alignment, * corresponding to the UCAL_ZONE_OFFSET and UCAL_DST_OFFSET fields. * This displays the ISO 8601 local time offset format. * @stable ICU 51 */ UDAT_TIMEZONE_ISO_LOCAL_FIELD = 33, /* NOTE(review): value 34 is skipped between this enumerator and the next; presumably it is reserved for a field this header does not expose - confirm against upstream ICU udat.h. */ /** * FieldPosition selector for 'b' field alignment. * Displays midnight and noon for 12am and 12pm, respectively, if available; * otherwise fall back to AM / PM. * @stable ICU 57 */ UDAT_AM_PM_MIDNIGHT_NOON_FIELD = 35, /** FieldPosition selector for 'B' field alignment.
* Displays flexible day periods, such as "in the morning", if available. * @stable ICU 57 */ UDAT_FLEXIBLE_DAY_PERIOD_FIELD = 36, } UDateFormatField; /** * Maps from a UDateFormatField to the corresponding UCalendarDateFields. * * Note 1: Since the mapping is many-to-one, there is no inverse mapping. * * Note 2: There is no UErrorCode parameter, so in case of error (UDateFormatField is * unknown or has no corresponding UCalendarDateFields value), the function returns the * current value of UCAL_FIELD_COUNT. However, that value may change from release to * release and is consequently deprecated. For a future-proof runtime way of checking * for errors: * a) First save the value returned by the function when it is passed an invalid value * such as "(UDateFormatField)-1". * b) Then, to test for errors when passing some other UDateFormatField value, check * whether the function returns that saved value. * * @param field the UDateFormatField. * @return the UCalendarDateField. In case of error (UDateFormatField is unknown or has * no corresponding UCalendarDateFields value) this will be the current value of * UCAL_FIELD_COUNT, but that value may change from release to release. * See Note 2 above. * @stable ICU 4.4 */ U_CAPI UCalendarDateFields U_EXPORT2 udat_toCalendarDateField(UDateFormatField field); /** * Open a new UDateFormat for formatting and parsing dates and times. * A UDateFormat may be used to format dates in calls to {@link #udat_format }, * and to parse dates in calls to {@link #udat_parse }. * @param timeStyle The style used to format times; one of UDAT_FULL, UDAT_LONG, * UDAT_MEDIUM, UDAT_SHORT, UDAT_DEFAULT, or UDAT_NONE (relative time styles * are not currently supported). * When the pattern parameter is used, pass in UDAT_PATTERN for both timeStyle and dateStyle. 
* @param dateStyle The style used to format dates; one of UDAT_FULL, UDAT_LONG, * UDAT_MEDIUM, UDAT_SHORT, UDAT_DEFAULT, UDAT_FULL_RELATIVE, UDAT_LONG_RELATIVE, * UDAT_MEDIUM_RELATIVE, UDAT_SHORT_RELATIVE, or UDAT_NONE. * When the pattern parameter is used, pass in UDAT_PATTERN for both timeStyle and dateStyle. * As currently implemented, * relative date formatting only affects a limited range of calendar days before or * after the current date, based on the CLDR <field type="day">/<relative> data: For * example, in English, "Yesterday", "Today", and "Tomorrow". Outside of this range, * dates are formatted using the corresponding non-relative style. * @param locale The locale specifying the formatting conventions * @param tzID A timezone ID specifying the timezone to use. If 0, use * the default timezone. * @param tzIDLength The length of tzID, or -1 if null-terminated. * @param pattern A pattern specifying the format to use. * @param patternLength The number of characters in the pattern, or -1 if null-terminated. * @param status A pointer to an UErrorCode to receive any errors * @return A pointer to a UDateFormat to use for formatting dates and times, or 0 if * an error occurred. * @stable ICU 2.0 */ U_CAPI UDateFormat* U_EXPORT2 udat_open(UDateFormatStyle timeStyle, UDateFormatStyle dateStyle, const char *locale, const UChar *tzID, int32_t tzIDLength, const UChar *pattern, int32_t patternLength, UErrorCode *status); /** * Close a UDateFormat. * Once closed, a UDateFormat may no longer be used. * @param format The formatter to close. * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 udat_close(UDateFormat* format); /** * DateFormat boolean attributes * * @stable ICU 53 */ typedef enum UDateFormatBooleanAttribute { /** * indicates whether whitespace is allowed. Includes trailing dot tolerance. * @stable ICU 53 */ UDAT_PARSE_ALLOW_WHITESPACE = 0, /** * indicates tolerance of numeric data when String data may be assumed. 
eg: UDAT_YEAR_NAME_FIELD, * UDAT_STANDALONE_MONTH_FIELD, UDAT_DAY_OF_WEEK_FIELD * @stable ICU 53 */ UDAT_PARSE_ALLOW_NUMERIC = 1, /** * indicates tolerance of a partial literal match * e.g. accepting "--mon-02-march-2011" for a pattern of "'--: 'EEE-WW-MMMM-yyyy" * @stable ICU 56 */ UDAT_PARSE_PARTIAL_LITERAL_MATCH = 2, /** * indicates tolerance of pattern mismatch between input data and specified format pattern. * e.g. accepting "September" for a month pattern of MMM ("Sep") * @stable ICU 56 */ UDAT_PARSE_MULTIPLE_PATTERNS_FOR_MATCH = 3, /* Do not conditionalize the following with #ifndef U_HIDE_DEPRECATED_API, * it is needed for layout of DateFormat object. */ #ifndef U_FORCE_HIDE_DEPRECATED_API /** * One more than the highest normal UDateFormatBooleanAttribute value. * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420. */ UDAT_BOOLEAN_ATTRIBUTE_COUNT = 4 #endif // U_FORCE_HIDE_DEPRECATED_API } UDateFormatBooleanAttribute; /** * Get a boolean attribute associated with a UDateFormat. * An example would be a true value for a key of UDAT_PARSE_ALLOW_WHITESPACE indicating allowing whitespace leniency. * If the formatter does not understand the attribute, -1 is returned. * @param fmt The formatter to query. * @param attr The attribute to query; e.g. UDAT_PARSE_ALLOW_WHITESPACE. * @param status A pointer to an UErrorCode to receive any errors * @return The value of attr. * @stable ICU 53 */ U_CAPI UBool U_EXPORT2 udat_getBooleanAttribute(const UDateFormat* fmt, UDateFormatBooleanAttribute attr, UErrorCode* status); /** * Set a boolean attribute associated with a UDateFormat. * An example of a boolean attribute is parse leniency control. If the formatter does not understand * the attribute, the call is ignored. * @param fmt The formatter to set. * @param attr The attribute to set; one of UDAT_PARSE_ALLOW_WHITESPACE or UDAT_PARSE_ALLOW_NUMERIC * @param newValue The new value of attr. 
* @param status A pointer to an UErrorCode to receive any errors * @stable ICU 53 */ U_CAPI void U_EXPORT2 udat_setBooleanAttribute(UDateFormat *fmt, UDateFormatBooleanAttribute attr, UBool newValue, UErrorCode* status); #if (NTDDI_VERSION >= NTDDI_WIN11_ZN) /** * Hour Cycle. * @stable ICU 67 */ typedef enum UDateFormatHourCycle { /** * Hour in am/pm (0~11) * @stable ICU 67 */ UDAT_HOUR_CYCLE_11, /** * Hour in am/pm (1~12) * @stable ICU 67 */ UDAT_HOUR_CYCLE_12, /** * Hour in day (0~23) * @stable ICU 67 */ UDAT_HOUR_CYCLE_23, /** * Hour in day (1~24) * @stable ICU 67 */ UDAT_HOUR_CYCLE_24 } UDateFormatHourCycle; #endif // (NTDDI_VERSION >= NTDDI_WIN11_ZN) /** * Open a copy of a UDateFormat. * This function performs a deep copy. * @param fmt The format to copy * @param status A pointer to an UErrorCode to receive any errors. * @return A pointer to a UDateFormat identical to fmt. * @stable ICU 2.0 */ U_CAPI UDateFormat* U_EXPORT2 udat_clone(const UDateFormat *fmt, UErrorCode *status); /** * Format a date using a UDateFormat. * The date will be formatted using the conventions specified in {@link #udat_open } * @param format The formatter to use * @param dateToFormat The date to format * @param result A pointer to a buffer to receive the formatted number. * @param resultLength The maximum size of result. * @param position A pointer to a UFieldPosition. On input, position->field * is read. On output, position->beginIndex and position->endIndex indicate * the beginning and ending indices of field number position->field, if such * a field exists. This parameter may be NULL, in which case no field * position data is returned. * @param status A pointer to an UErrorCode to receive any errors * @return The total buffer size needed; if greater than resultLength, the output was truncated. 
* @see udat_parse * @see UFieldPosition * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 udat_format( const UDateFormat* format, UDate dateToFormat, UChar* result, int32_t resultLength, UFieldPosition* position, UErrorCode* status); /** * Format a date using an UDateFormat. * The date will be formatted using the conventions specified in {@link #udat_open } * @param format The formatter to use * @param calendar The calendar to format. The calendar instance might be * mutated if fields are not yet fully calculated, though * the function won't change the logical date and time held * by the instance. * @param result A pointer to a buffer to receive the formatted number. * @param capacity The maximum size of result. * @param position A pointer to a UFieldPosition. On input, position->field * is read. On output, position->beginIndex and position->endIndex indicate * the beginning and ending indices of field number position->field, if such * a field exists. This parameter may be NULL, in which case no field * position data is returned. * @param status A pointer to an UErrorCode to receive any errors * @return The total buffer size needed; if greater than capacity, the output was truncated. * @see udat_format * @see udat_parseCalendar * @see UFieldPosition * @stable ICU 55 */ U_CAPI int32_t U_EXPORT2 udat_formatCalendar( const UDateFormat* format, UCalendar* calendar, UChar* result, int32_t capacity, UFieldPosition* position, UErrorCode* status); /** * Format a date using a UDateFormat. * The date will be formatted using the conventions specified in {@link #udat_open} * @param format * The formatter to use * @param dateToFormat * The date to format * @param result * A pointer to a buffer to receive the formatted number. * @param resultLength * The maximum size of result. * @param fpositer * A pointer to a UFieldPositionIterator created by {@link #ufieldpositer_open} * (may be NULL if field position information is not needed).
Any * iteration information already present in the UFieldPositionIterator * will be deleted, and the iterator will be reset to apply to the * fields in the formatted string created by this function call; the * field values provided by {@link #ufieldpositer_next} will be from the * UDateFormatField enum. * @param status * A pointer to a UErrorCode to receive any errors * @return * The total buffer size needed; if greater than resultLength, the output was truncated. * @see udat_parse * @see UFieldPositionIterator * @stable ICU 55 */ U_CAPI int32_t U_EXPORT2 udat_formatForFields( const UDateFormat* format, UDate dateToFormat, UChar* result, int32_t resultLength, UFieldPositionIterator* fpositer, UErrorCode* status); /** * Format a date using a UDateFormat. * The date will be formatted using the conventions specified in {@link #udat_open } * @param format * The formatter to use * @param calendar * The calendar to format. The calendar instance might be mutated if fields * are not yet fully calculated, though the function won't change the logical * date and time held by the instance. * @param result * A pointer to a buffer to receive the formatted number. * @param capacity * The maximum size of result. * @param fpositer * A pointer to a UFieldPositionIterator created by {@link #ufieldpositer_open} * (may be NULL if field position information is not needed). Any * iteration information already present in the UFieldPositionIterator * will be deleted, and the iterator will be reset to apply to the * fields in the formatted string created by this function call; the * field values provided by {@link #ufieldpositer_next} will be from the * UDateFormatField enum. * @param status * A pointer to a UErrorCode to receive any errors * @return * The total buffer size needed; if greater than capacity, the output was truncated.
* @see udat_format * @see udat_parseCalendar * @see UFieldPositionIterator * @stable ICU 55 */ U_CAPI int32_t U_EXPORT2 udat_formatCalendarForFields( const UDateFormat* format, UCalendar* calendar, UChar* result, int32_t capacity, UFieldPositionIterator* fpositer, UErrorCode* status); /** * Parse a string into an date/time using a UDateFormat. * The date will be parsed using the conventions specified in {@link #udat_open }. *

* Note that the normal date formats associated with some calendars - such * as the Chinese lunar calendar - do not specify enough fields to enable * dates to be parsed unambiguously. In the case of the Chinese lunar * calendar, while the year within the current 60-year cycle is specified, * the number of such cycles since the start date of the calendar (in the * UCAL_ERA field of the UCalendar object) is not normally part of the format, * and parsing may assume the wrong era. For cases such as this it is * recommended that clients parse using udat_parseCalendar with the UCalendar * passed in set to the current date, or to a date within the era/cycle that * should be assumed if absent in the format. * * @param format The formatter to use. * @param text The text to parse. * @param textLength The length of text, or -1 if null-terminated. * @param parsePos If not 0, on input a pointer to an integer specifying the offset at which * to begin parsing. If not 0, on output the offset at which parsing ended. * @param status A pointer to an UErrorCode to receive any errors * @return The value of the parsed date/time * @see udat_format * @stable ICU 2.0 */ U_CAPI UDate U_EXPORT2 udat_parse(const UDateFormat* format, const UChar* text, int32_t textLength, int32_t *parsePos, UErrorCode *status); /** * Parse a string into an date/time using a UDateFormat. * The date will be parsed using the conventions specified in {@link #udat_open }. * @param format The formatter to use. * @param calendar A calendar set on input to the date and time to be used for * missing values in the date/time string being parsed, and set * on output to the parsed date/time. 
When the calendar type is * different from the internal calendar held by the UDateFormat * instance, the internal calendar will be cloned to a work * calendar set to the same milliseconds and time zone as this * calendar parameter, field values will be parsed based on the * work calendar, then the result (milliseconds and time zone) * will be set in this calendar. * @param text The text to parse. * @param textLength The length of text, or -1 if null-terminated. * @param parsePos If not 0, on input a pointer to an integer specifying the offset at which * to begin parsing. If not 0, on output the offset at which parsing ended. * @param status A pointer to an UErrorCode to receive any errors * @see udat_format * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 udat_parseCalendar(const UDateFormat* format, UCalendar* calendar, const UChar* text, int32_t textLength, int32_t *parsePos, UErrorCode *status); /** * Determine if an UDateFormat will perform lenient parsing. * With lenient parsing, the parser may use heuristics to interpret inputs that do not * precisely match the pattern. With strict parsing, inputs must match the pattern. * @param fmt The formatter to query * @return true if fmt is set to perform lenient parsing, false otherwise. * @see udat_setLenient * @stable ICU 2.0 */ U_CAPI UBool U_EXPORT2 udat_isLenient(const UDateFormat* fmt); /** * Specify whether an UDateFormat will perform lenient parsing. * With lenient parsing, the parser may use heuristics to interpret inputs that do not * precisely match the pattern. With strict parsing, inputs must match the pattern. * @param fmt The formatter to set * @param isLenient true if fmt should perform lenient parsing, false otherwise. * @see udat_isLenient * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 udat_setLenient( UDateFormat* fmt, UBool isLenient); /** * Get the UCalendar associated with an UDateFormat. * A UDateFormat uses a UCalendar to convert a raw value to, for example, * the day of the week.
* @param fmt The formatter to query. * @return A pointer to the UCalendar used by fmt. * @see udat_setCalendar * @stable ICU 2.0 */ U_CAPI const UCalendar* U_EXPORT2 udat_getCalendar(const UDateFormat* fmt); /** * Set the UCalendar associated with an UDateFormat. * A UDateFormat uses a UCalendar to convert a raw value to, for example, * the day of the week. * @param fmt The formatter to set. * @param calendarToSet A pointer to an UCalendar to be used by fmt. * @see udat_getCalendar * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 udat_setCalendar( UDateFormat* fmt, const UCalendar* calendarToSet); /** * Get the UNumberFormat associated with an UDateFormat. * A UDateFormat uses a UNumberFormat to format numbers within a date, * for example the day number. * @param fmt The formatter to query. * @return A pointer to the UNumberFormat used by fmt to format numbers. * @see udat_setNumberFormat * @stable ICU 2.0 */ U_CAPI const UNumberFormat* U_EXPORT2 udat_getNumberFormat(const UDateFormat* fmt); /** * Get the UNumberFormat for specific field associated with an UDateFormat. * For example: 'y' for year and 'M' for month * @param fmt The formatter to query. * @param field the field to query * @return A pointer to the UNumberFormat used by fmt to format field numbers. * @see udat_adoptNumberFormatForFields * @stable ICU 54 */ U_CAPI const UNumberFormat* U_EXPORT2 udat_getNumberFormatForField(const UDateFormat* fmt, UChar field); /** * Set the UNumberFormat for specific field associated with an UDateFormat. * It can be a single field like: "y"(year) or "M"(month) * It can be several field combined together: "yM"(year and month) * Note: * 1 symbol field is enough for multiple symbol field (so "y" will override "yy", "yyy") * If the field is not numeric, then override has no effect (like "MMM" will use abbreviation, not numerical field) * * @param fields the fields to set * @param fmt The formatter to set.
* @param numberFormatToSet A pointer to the UNumberFormat to be used by fmt to format numbers. * @param status error code passed around (memory allocation or invalid fields) * @see udat_getNumberFormatForField * @stable ICU 54 */ U_CAPI void U_EXPORT2 udat_adoptNumberFormatForFields( UDateFormat* fmt, const UChar* fields, UNumberFormat* numberFormatToSet, UErrorCode* status); /** * Set the UNumberFormat associated with an UDateFormat. * A UDateFormat uses a UNumberFormat to format numbers within a date, * for example the day number. * This method also clears per field NumberFormat instances previously * set by {@link #udat_adoptNumberFormatForFields} * @param fmt The formatter to set. * @param numberFormatToSet A pointer to the UNumberFormat to be used by fmt to format numbers. * @see udat_getNumberFormat * @see udat_adoptNumberFormatForFields * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 udat_setNumberFormat( UDateFormat* fmt, const UNumberFormat* numberFormatToSet); /** * Adopt the UNumberFormat associated with an UDateFormat. * A UDateFormat uses a UNumberFormat to format numbers within a date, * for example the day number. * @param fmt The formatter to set. * @param numberFormatToAdopt A pointer to the UNumberFormat to be used by fmt to format numbers. * @see udat_getNumberFormat * @stable ICU 54 */ U_CAPI void U_EXPORT2 udat_adoptNumberFormat( UDateFormat* fmt, UNumberFormat* numberFormatToAdopt); /** * Get a locale for which date/time formatting patterns are available. * A UDateFormat in a locale returned by this function will perform the correct * formatting and parsing for the locale. * @param localeIndex The index of the desired locale. * @return A locale for which date/time formatting patterns are available, or 0 if none. * @see udat_countAvailable * @stable ICU 2.0 */ U_CAPI const char* U_EXPORT2 udat_getAvailable(int32_t localeIndex); /** * Determine how many locales have date/time formatting patterns available.
* This function is most useful as determining the loop ending condition for * calls to {@link #udat_getAvailable }. * @return The number of locales for which date/time formatting patterns are available. * @see udat_getAvailable * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 udat_countAvailable(void); /** * Get the year relative to which all 2-digit years are interpreted. * For example, if the 2-digit start year is 2100, the year 99 will be * interpreted as 2199. * @param fmt The formatter to query. * @param status A pointer to an UErrorCode to receive any errors * @return The year relative to which all 2-digit years are interpreted. * @see udat_set2DigitYearStart * @stable ICU 2.0 */ U_CAPI UDate U_EXPORT2 udat_get2DigitYearStart( const UDateFormat *fmt, UErrorCode *status); /** * Set the year relative to which all 2-digit years will be interpreted. * For example, if the 2-digit start year is 2100, the year 99 will be * interpreted as 2199. * @param fmt The formatter to set. * @param d The year relative to which all 2-digit years will be interpreted. * @param status A pointer to an UErrorCode to receive any errors * @see udat_get2DigitYearStart * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 udat_set2DigitYearStart( UDateFormat *fmt, UDate d, UErrorCode *status); /** * Extract the pattern from a UDateFormat. * The pattern will follow the pattern syntax rules. * @param fmt The formatter to query. * @param localized true if the pattern should be localized, false otherwise. * @param result A pointer to a buffer to receive the pattern. * @param resultLength The maximum size of result. * @param status A pointer to an UErrorCode to receive any errors * @return The total buffer size needed; if greater than resultLength, the output was truncated. * @see udat_applyPattern * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 udat_toPattern( const UDateFormat *fmt, UBool localized, UChar *result, int32_t resultLength, UErrorCode *status); /** * Set the pattern used by an UDateFormat.
* The pattern should follow the pattern syntax rules. * @param format The formatter to set. * @param localized true if the pattern is localized, false otherwise. * @param pattern The new pattern * @param patternLength The length of pattern, or -1 if null-terminated. * @see udat_toPattern * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 udat_applyPattern( UDateFormat *format, UBool localized, const UChar *pattern, int32_t patternLength); /** * The possible types of date format symbols * @stable ICU 2.6 */ typedef enum UDateFormatSymbolType { /** The era names, for example AD */ UDAT_ERAS, /** The month names, for example February */ UDAT_MONTHS, /** The short month names, for example Feb. */ UDAT_SHORT_MONTHS, /** The CLDR-style format "wide" weekday names, for example Monday */ UDAT_WEEKDAYS, /** * The CLDR-style format "abbreviated" (not "short") weekday names, for example "Mon." * For the CLDR-style format "short" weekday names, use UDAT_SHORTER_WEEKDAYS. */ UDAT_SHORT_WEEKDAYS, /** The AM/PM names, for example AM */ UDAT_AM_PMS, /** The localized characters */ UDAT_LOCALIZED_CHARS, /** The long era names, for example Anno Domini */ UDAT_ERA_NAMES, /** The narrow month names, for example F */ UDAT_NARROW_MONTHS, /** The CLDR-style format "narrow" weekday names, for example "M" */ UDAT_NARROW_WEEKDAYS, /** Standalone context versions of months */ UDAT_STANDALONE_MONTHS, UDAT_STANDALONE_SHORT_MONTHS, UDAT_STANDALONE_NARROW_MONTHS, /** The CLDR-style stand-alone "wide" weekday names */ UDAT_STANDALONE_WEEKDAYS, /** * The CLDR-style stand-alone "abbreviated" (not "short") weekday names. * For the CLDR-style stand-alone "short" weekday names, use UDAT_STANDALONE_SHORTER_WEEKDAYS. 
*/ UDAT_STANDALONE_SHORT_WEEKDAYS, /** The CLDR-style stand-alone "narrow" weekday names */ UDAT_STANDALONE_NARROW_WEEKDAYS, /** The quarters, for example 1st Quarter */ UDAT_QUARTERS, /** The short quarter names, for example Q1 */ UDAT_SHORT_QUARTERS, /** Standalone context versions of quarters */ UDAT_STANDALONE_QUARTERS, UDAT_STANDALONE_SHORT_QUARTERS, /** * The CLDR-style short weekday names, e.g. "Su", "Mo", etc. * These are named "SHORTER" to contrast with the constants using _SHORT_ * above, which actually get the CLDR-style *abbreviated* versions of the * corresponding names. * @stable ICU 51 */ UDAT_SHORTER_WEEKDAYS, /** * Standalone version of UDAT_SHORTER_WEEKDAYS. * @stable ICU 51 */ UDAT_STANDALONE_SHORTER_WEEKDAYS, /** * Cyclic year names (only supported for some calendars, and only for FORMAT usage; * udat_setSymbols not supported for UDAT_CYCLIC_YEARS_WIDE) * @stable ICU 54 */ UDAT_CYCLIC_YEARS_WIDE, /** * Cyclic year names (only supported for some calendars, and only for FORMAT usage) * @stable ICU 54 */ UDAT_CYCLIC_YEARS_ABBREVIATED, /** * Cyclic year names (only supported for some calendars, and only for FORMAT usage; * udat_setSymbols not supported for UDAT_CYCLIC_YEARS_NARROW) * @stable ICU 54 */ UDAT_CYCLIC_YEARS_NARROW, /** * Calendar zodiac names (only supported for some calendars, and only for FORMAT usage; * udat_setSymbols not supported for UDAT_ZODIAC_NAMES_WIDE) * @stable ICU 54 */ UDAT_ZODIAC_NAMES_WIDE, /** * Calendar zodiac names (only supported for some calendars, and only for FORMAT usage) * @stable ICU 54 */ UDAT_ZODIAC_NAMES_ABBREVIATED, /** * Calendar zodiac names (only supported for some calendars, and only for FORMAT usage; * udat_setSymbols not supported for UDAT_ZODIAC_NAMES_NARROW) * @stable ICU 54 */ UDAT_ZODIAC_NAMES_NARROW, #if (NTDDI_VERSION >= NTDDI_WIN11_ZN) /** * The narrow quarter names, for example 1 * @stable ICU 70 */ UDAT_NARROW_QUARTERS, /** * The narrow standalone quarter names, for example 1 * @stable ICU 70
*/ UDAT_STANDALONE_NARROW_QUARTERS #endif // (NTDDI_VERSION >= NTDDI_WIN11_ZN) } UDateFormatSymbolType; struct UDateFormatSymbols; /** Date format symbols. * For usage in C programs. * @stable ICU 2.6 */ typedef struct UDateFormatSymbols UDateFormatSymbols; /** * Get the symbols associated with an UDateFormat. * The symbols are what a UDateFormat uses to represent locale-specific data, * for example month or day names. * @param fmt The formatter to query. * @param type The type of symbols to get. One of UDAT_ERAS, UDAT_MONTHS, UDAT_SHORT_MONTHS, * UDAT_WEEKDAYS, UDAT_SHORT_WEEKDAYS, UDAT_AM_PMS, or UDAT_LOCALIZED_CHARS * @param symbolIndex The desired symbol of type type. * @param result A pointer to a buffer to receive the pattern. * @param resultLength The maximum size of result. * @param status A pointer to an UErrorCode to receive any errors * @return The total buffer size needed; if greater than resultLength, the output was truncated. * @see udat_countSymbols * @see udat_setSymbols * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 udat_getSymbols(const UDateFormat *fmt, UDateFormatSymbolType type, int32_t symbolIndex, UChar *result, int32_t resultLength, UErrorCode *status); /** * Count the number of particular symbols for an UDateFormat. * This function is most useful as for determining the loop termination condition * for calls to {@link #udat_getSymbols }. * @param fmt The formatter to query. * @param type The type of symbols to count. One of UDAT_ERAS, UDAT_MONTHS, UDAT_SHORT_MONTHS, * UDAT_WEEKDAYS, UDAT_SHORT_WEEKDAYS, UDAT_AM_PMS, or UDAT_LOCALIZED_CHARS * @return The number of symbols of type type. * @see udat_getSymbols * @see udat_setSymbols * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 udat_countSymbols( const UDateFormat *fmt, UDateFormatSymbolType type); /** * Set the symbols associated with an UDateFormat. * The symbols are what a UDateFormat uses to represent locale-specific data, * for example month or day names. 
* @param format The formatter to set * @param type The type of symbols to set. One of UDAT_ERAS, UDAT_MONTHS, UDAT_SHORT_MONTHS, * UDAT_WEEKDAYS, UDAT_SHORT_WEEKDAYS, UDAT_AM_PMS, or UDAT_LOCALIZED_CHARS * @param symbolIndex The index of the symbol to set of type type. * @param value The new value * @param valueLength The length of value, or -1 if null-terminated * @param status A pointer to an UErrorCode to receive any errors * @see udat_getSymbols * @see udat_countSymbols * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 udat_setSymbols( UDateFormat *format, UDateFormatSymbolType type, int32_t symbolIndex, UChar *value, int32_t valueLength, UErrorCode *status); /** * Get the locale for this date format object. * You can choose between valid and actual locale. * @param fmt The formatter to get the locale from * @param type type of the locale we're looking for (valid or actual) * @param status error code for the operation * @return the locale name * @stable ICU 2.8 */ U_CAPI const char* U_EXPORT2 udat_getLocaleByType(const UDateFormat *fmt, ULocDataLocaleType type, UErrorCode* status); /** * Set a particular UDisplayContext value in the formatter, such as * UDISPCTX_CAPITALIZATION_FOR_STANDALONE. * @param fmt The formatter for which to set a UDisplayContext value. * @param value The UDisplayContext value to set. * @param status A pointer to an UErrorCode to receive any errors * @stable ICU 51 */ U_CAPI void U_EXPORT2 udat_setContext(UDateFormat* fmt, UDisplayContext value, UErrorCode* status); /** * Get the formatter's UDisplayContext value for the specified UDisplayContextType, * such as UDISPCTX_TYPE_CAPITALIZATION. * @param fmt The formatter to query. * @param type The UDisplayContextType whose value to return * @param status A pointer to an UErrorCode to receive any errors * @return The UDisplayContextValue for the specified type. 
* @stable ICU 53 */ U_CAPI UDisplayContext U_EXPORT2 udat_getContext(const UDateFormat* fmt, UDisplayContextType type, UErrorCode* status); #endif /* #if !UCONFIG_NO_FORMATTING */ #endif // udatpg.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * * Copyright (C) 2007-2015, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* * file name: udatpg.h * encoding: UTF-8 * tab size: 8 (not used) * indentation:4 * * created on: 2007jul30 * created by: Markus W. Scherer */ #ifndef __UDATPG_H__ #define __UDATPG_H__ /** * \file * \brief C API: Wrapper for icu::DateTimePatternGenerator (unicode/dtptngen.h). * * UDateTimePatternGenerator provides flexible generation of date format patterns, * like "yy-MM-dd". The user can build up the generator by adding successive * patterns. Once that is done, a query can be made using a "skeleton", which is * a pattern which just includes the desired fields and lengths. The generator * will return the "best fit" pattern corresponding to that skeleton. *

The main method people will use is udatpg_getBestPattern, since normally * UDateTimePatternGenerator is pre-built with data from a particular locale. * However, generators can be built directly from other data as well. *

Issue: may be useful to also have a function that returns the list of * fields in a pattern, in order, since we have that internally. * That would be useful for getting the UI order of field elements. */ /** * Opaque type for a date/time pattern generator object. * @stable ICU 3.8 */ typedef void *UDateTimePatternGenerator; /** * Field number constants for udatpg_getAppendItemFormats() and similar functions. * These constants are separate from UDateFormatField despite semantic overlap * because some fields are merged for the date/time pattern generator. * @stable ICU 3.8 */ typedef enum UDateTimePatternField { /** @stable ICU 3.8 */ UDATPG_ERA_FIELD, /** @stable ICU 3.8 */ UDATPG_YEAR_FIELD, /** @stable ICU 3.8 */ UDATPG_QUARTER_FIELD, /** @stable ICU 3.8 */ UDATPG_MONTH_FIELD, /** @stable ICU 3.8 */ UDATPG_WEEK_OF_YEAR_FIELD, /** @stable ICU 3.8 */ UDATPG_WEEK_OF_MONTH_FIELD, /** @stable ICU 3.8 */ UDATPG_WEEKDAY_FIELD, /** @stable ICU 3.8 */ UDATPG_DAY_OF_YEAR_FIELD, /** @stable ICU 3.8 */ UDATPG_DAY_OF_WEEK_IN_MONTH_FIELD, /** @stable ICU 3.8 */ UDATPG_DAY_FIELD, /** @stable ICU 3.8 */ UDATPG_DAYPERIOD_FIELD, /** @stable ICU 3.8 */ UDATPG_HOUR_FIELD, /** @stable ICU 3.8 */ UDATPG_MINUTE_FIELD, /** @stable ICU 3.8 */ UDATPG_SECOND_FIELD, /** @stable ICU 3.8 */ UDATPG_FRACTIONAL_SECOND_FIELD, /** @stable ICU 3.8 */ UDATPG_ZONE_FIELD, /* Do not conditionalize the following with #ifndef U_HIDE_DEPRECATED_API, * it is needed for layout of DateTimePatternGenerator object. */ #ifndef U_FORCE_HIDE_DEPRECATED_API /** * One more than the highest normal UDateTimePatternField value. * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420. */ UDATPG_FIELD_COUNT #endif // U_FORCE_HIDE_DEPRECATED_API } UDateTimePatternField; #if (NTDDI_VERSION >= NTDDI_WIN10_VB) /** * Field display name width constants for udatpg_getFieldDisplayName(). 
* @stable ICU 61 */ typedef enum UDateTimePGDisplayWidth { /** @stable ICU 61 */ UDATPG_WIDE, /** @stable ICU 61 */ UDATPG_ABBREVIATED, /** @stable ICU 61 */ UDATPG_NARROW } UDateTimePGDisplayWidth; #endif // (NTDDI_VERSION >= NTDDI_WIN10_VB) /** * Masks to control forcing the length of specified fields in the returned * pattern to match those in the skeleton (when this would not happen * otherwise). These may be combined to force the length of multiple fields. * Used with udatpg_getBestPatternWithOptions, udatpg_replaceFieldTypesWithOptions. * @stable ICU 4.4 */ typedef enum UDateTimePatternMatchOptions { /** @stable ICU 4.4 */ UDATPG_MATCH_NO_OPTIONS = 0, /** @stable ICU 4.4 */ UDATPG_MATCH_HOUR_FIELD_LENGTH = 1 << UDATPG_HOUR_FIELD, /** @stable ICU 4.4 */ UDATPG_MATCH_ALL_FIELDS_LENGTH = (1 << UDATPG_FIELD_COUNT) - 1 } UDateTimePatternMatchOptions; /** * Status return values from udatpg_addPattern(). * @stable ICU 3.8 */ typedef enum UDateTimePatternConflict { /** @stable ICU 3.8 */ UDATPG_NO_CONFLICT, /** @stable ICU 3.8 */ UDATPG_BASE_CONFLICT, /** @stable ICU 3.8 */ UDATPG_CONFLICT, } UDateTimePatternConflict; /** * Open a generator according to a given locale. * @param locale * @param pErrorCode a pointer to the UErrorCode which must not indicate a * failure before the function call. * @return a pointer to UDateTimePatternGenerator. * @stable ICU 3.8 */ U_CAPI UDateTimePatternGenerator * U_EXPORT2 udatpg_open(const char *locale, UErrorCode *pErrorCode); /** * Open an empty generator, to be constructed with udatpg_addPattern(...) etc. * @param pErrorCode a pointer to the UErrorCode which must not indicate a * failure before the function call. * @return a pointer to UDateTimePatternGenerator. * @stable ICU 3.8 */ U_CAPI UDateTimePatternGenerator * U_EXPORT2 udatpg_openEmpty(UErrorCode *pErrorCode); /** * Close a generator. * @param dtpg a pointer to UDateTimePatternGenerator. 
* @stable ICU 3.8 */ U_CAPI void U_EXPORT2 udatpg_close(UDateTimePatternGenerator *dtpg); /** * Create a copy pf a generator. * @param dtpg a pointer to UDateTimePatternGenerator to be copied. * @param pErrorCode a pointer to the UErrorCode which must not indicate a * failure before the function call. * @return a pointer to a new UDateTimePatternGenerator. * @stable ICU 3.8 */ U_CAPI UDateTimePatternGenerator * U_EXPORT2 udatpg_clone(const UDateTimePatternGenerator *dtpg, UErrorCode *pErrorCode); /** * Get the best pattern matching the input skeleton. It is guaranteed to * have all of the fields in the skeleton. * * Note that this function uses a non-const UDateTimePatternGenerator: * It uses a stateful pattern parser which is set up for each generator object, * rather than creating one for each function call. * Consecutive calls to this function do not affect each other, * but this function cannot be used concurrently on a single generator object. * * @param dtpg a pointer to UDateTimePatternGenerator. * @param skeleton * The skeleton is a pattern containing only the variable fields. * For example, "MMMdd" and "mmhh" are skeletons. * @param length the length of skeleton * @param bestPattern * The best pattern found from the given skeleton. * @param capacity the capacity of bestPattern. * @param pErrorCode a pointer to the UErrorCode which must not indicate a * failure before the function call. * @return the length of bestPattern. * @stable ICU 3.8 */ U_CAPI int32_t U_EXPORT2 udatpg_getBestPattern(UDateTimePatternGenerator *dtpg, const UChar *skeleton, int32_t length, UChar *bestPattern, int32_t capacity, UErrorCode *pErrorCode); /** * Get the best pattern matching the input skeleton. It is guaranteed to * have all of the fields in the skeleton. * * Note that this function uses a non-const UDateTimePatternGenerator: * It uses a stateful pattern parser which is set up for each generator object, * rather than creating one for each function call. 
* Consecutive calls to this function do not affect each other, * but this function cannot be used concurrently on a single generator object. * * @param dtpg a pointer to UDateTimePatternGenerator. * @param skeleton * The skeleton is a pattern containing only the variable fields. * For example, "MMMdd" and "mmhh" are skeletons. * @param length the length of skeleton * @param options * Options for forcing the length of specified fields in the * returned pattern to match those in the skeleton (when this * would not happen otherwise). For default behavior, use * UDATPG_MATCH_NO_OPTIONS. * @param bestPattern * The best pattern found from the given skeleton. * @param capacity * the capacity of bestPattern. * @param pErrorCode * a pointer to the UErrorCode which must not indicate a * failure before the function call. * @return the length of bestPattern. * @stable ICU 4.4 */ U_CAPI int32_t U_EXPORT2 udatpg_getBestPatternWithOptions(UDateTimePatternGenerator *dtpg, const UChar *skeleton, int32_t length, UDateTimePatternMatchOptions options, UChar *bestPattern, int32_t capacity, UErrorCode *pErrorCode); /** * Get a unique skeleton from a given pattern. For example, * both "MMM-dd" and "dd/MMM" produce the skeleton "MMMdd". * * Note that this function uses a non-const UDateTimePatternGenerator: * It uses a stateful pattern parser which is set up for each generator object, * rather than creating one for each function call. * Consecutive calls to this function do not affect each other, * but this function cannot be used concurrently on a single generator object. * * @param unusedDtpg a pointer to UDateTimePatternGenerator. * This parameter is no longer used. Callers may pass NULL. * @param pattern input pattern, such as "dd/MMM". * @param length the length of pattern. * @param skeleton such as "MMMdd" * @param capacity the capacity of skeleton. * @param pErrorCode a pointer to the UErrorCode which must not indicate a * failure before the function call. 
* @return the length of skeleton. * @stable ICU 3.8 */ U_CAPI int32_t U_EXPORT2 udatpg_getSkeleton(UDateTimePatternGenerator *unusedDtpg, const UChar *pattern, int32_t length, UChar *skeleton, int32_t capacity, UErrorCode *pErrorCode); /** * Get a unique base skeleton from a given pattern. This is the same * as the skeleton, except that differences in length are minimized so * as to only preserve the difference between string and numeric form. So * for example, both "MMM-dd" and "d/MMM" produce the skeleton "MMMd" * (notice the single d). * * Note that this function uses a non-const UDateTimePatternGenerator: * It uses a stateful pattern parser which is set up for each generator object, * rather than creating one for each function call. * Consecutive calls to this function do not affect each other, * but this function cannot be used concurrently on a single generator object. * * @param unusedDtpg a pointer to UDateTimePatternGenerator. * This parameter is no longer used. Callers may pass NULL. * @param pattern input pattern, such as "dd/MMM". * @param length the length of pattern. * @param baseSkeleton such as "Md" * @param capacity the capacity of base skeleton. * @param pErrorCode a pointer to the UErrorCode which must not indicate a * failure before the function call. * @return the length of baseSkeleton. * @stable ICU 3.8 */ U_CAPI int32_t U_EXPORT2 udatpg_getBaseSkeleton(UDateTimePatternGenerator *unusedDtpg, const UChar *pattern, int32_t length, UChar *baseSkeleton, int32_t capacity, UErrorCode *pErrorCode); /** * Adds a pattern to the generator. If the pattern has the same skeleton as * an existing pattern, and the override parameter is set, then the previous * value is overridden. Otherwise, the previous value is retained. In either * case, the conflicting status is set and previous vale is stored in * conflicting pattern. *

* Note that single-field patterns (like "MMM") are automatically added, and * don't need to be added explicitly! * * @param dtpg a pointer to UDateTimePatternGenerator. * @param pattern input pattern, such as "dd/MMM" * @param patternLength the length of pattern. * @param override When existing values are to be overridden use true, * otherwise use false. * @param conflictingPattern Previous pattern with the same skeleton. * @param capacity the capacity of conflictingPattern. * @param pLength a pointer to the length of conflictingPattern. * @param pErrorCode a pointer to the UErrorCode which must not indicate a * failure before the function call. * @return conflicting status. The value could be UDATPG_NO_CONFLICT, * UDATPG_BASE_CONFLICT or UDATPG_CONFLICT. * @stable ICU 3.8 */ U_CAPI UDateTimePatternConflict U_EXPORT2 udatpg_addPattern(UDateTimePatternGenerator *dtpg, const UChar *pattern, int32_t patternLength, UBool override, UChar *conflictingPattern, int32_t capacity, int32_t *pLength, UErrorCode *pErrorCode); /** * An AppendItem format is a pattern used to append a field if there is no * good match. For example, suppose that the input skeleton is "GyyyyMMMd", * and there is no matching pattern internally, but there is a pattern * matching "yyyyMMMd", say "d-MM-yyyy". Then that pattern is used, plus the * G. The way these two are conjoined is by using the AppendItemFormat for G * (era). So if that value is, say "{0}, {1}" then the final resulting * pattern is "d-MM-yyyy, G". *

* There are actually three available variables: {0} is the pattern so far, * {1} is the element we are adding, and {2} is the name of the element. *

* This reflects the way that the CLDR data is organized. * * @param dtpg a pointer to UDateTimePatternGenerator. * @param field UDateTimePatternField, such as UDATPG_ERA_FIELD * @param value pattern, such as "{0}, {1}" * @param length the length of value. * @stable ICU 3.8 */ U_CAPI void U_EXPORT2 udatpg_setAppendItemFormat(UDateTimePatternGenerator *dtpg, UDateTimePatternField field, const UChar *value, int32_t length); /** * Getter corresponding to setAppendItemFormat. Values below 0 or at or * above UDATPG_FIELD_COUNT are illegal arguments. * * @param dtpg A pointer to UDateTimePatternGenerator. * @param field UDateTimePatternField, such as UDATPG_ERA_FIELD * @param pLength A pointer that will receive the length of appendItemFormat. * @return appendItemFormat for field. * @stable ICU 3.8 */ U_CAPI const UChar * U_EXPORT2 udatpg_getAppendItemFormat(const UDateTimePatternGenerator *dtpg, UDateTimePatternField field, int32_t *pLength); /** * Set the name of field, eg "era" in English for ERA. These are only * used if the corresponding AppendItemFormat is used, and if it contains a * {2} variable. *

* This reflects the way that the CLDR data is organized. * * @param dtpg a pointer to UDateTimePatternGenerator. * @param field UDateTimePatternField * @param value name for the field. * @param length the length of value. * @stable ICU 3.8 */ U_CAPI void U_EXPORT2 udatpg_setAppendItemName(UDateTimePatternGenerator *dtpg, UDateTimePatternField field, const UChar *value, int32_t length); /** * Getter corresponding to setAppendItemNames. Values below 0 or at or above * UDATPG_FIELD_COUNT are illegal arguments. Note: The more general function * for getting date/time field display names is udatpg_getFieldDisplayName. * * @param dtpg a pointer to UDateTimePatternGenerator. * @param field UDateTimePatternField, such as UDATPG_ERA_FIELD * @param pLength A pointer that will receive the length of the name for field. * @return name for field * @see udatpg_getFieldDisplayName * @stable ICU 3.8 */ U_CAPI const UChar * U_EXPORT2 udatpg_getAppendItemName(const UDateTimePatternGenerator *dtpg, UDateTimePatternField field, int32_t *pLength); #if (NTDDI_VERSION >= NTDDI_WIN10_VB) /** * The general interface to get a display name for a particular date/time field, * in one of several possible display widths. * * @param dtpg * A pointer to the UDateTimePatternGenerator object with the localized * display names. * @param field * The desired UDateTimePatternField, such as UDATPG_ERA_FIELD. * @param width * The desired UDateTimePGDisplayWidth, such as UDATPG_ABBREVIATED. * @param fieldName * A pointer to a buffer to receive the NULL-terminated display name. If the name * fits into fieldName but cannot be NULL-terminated (length == capacity) then * the error code is set to U_STRING_NOT_TERMINATED_WARNING. If the name doesn't * fit into fieldName then the error code is set to U_BUFFER_OVERFLOW_ERROR. * @param capacity * The size of fieldName (in UChars). 
* @param pErrorCode * A pointer to a UErrorCode to receive any errors * @return * The full length of the name; if greater than capacity, fieldName contains a * truncated result. * @stable ICU 61 */ U_CAPI int32_t U_EXPORT2 udatpg_getFieldDisplayName(const UDateTimePatternGenerator *dtpg, UDateTimePatternField field, UDateTimePGDisplayWidth width, UChar *fieldName, int32_t capacity, UErrorCode *pErrorCode); #endif // (NTDDI_VERSION >= NTDDI_WIN10_VB) /** * The DateTimeFormat is a message format pattern used to compose date and * time patterns. The default pattern in the root locale is "{1} {0}", where * {1} will be replaced by the date pattern and {0} will be replaced by the * time pattern; however, other locales may specify patterns such as * "{1}, {0}" or "{1} 'at' {0}", etc. *

* This is used when the input skeleton contains both date and time fields, * but there is not a close match among the added patterns. For example, * suppose that this object was created by adding "dd-MMM" and "hh:mm", and * its DateTimeFormat is the default "{1} {0}". Then if the input skeleton * is "MMMdhmm", there is not an exact match, so the input skeleton is * broken up into two components "MMMd" and "hmm". There are close matches * for those two skeletons, so the result is put together with this pattern, * resulting in "d-MMM h:mm". * * There are four DateTimeFormats in a UDateTimePatternGenerator object, * corresponding to date styles UDAT_FULL..UDAT_SHORT. This method sets * all of them to the specified pattern. To set them individually, see * udatpg_setDateTimeFormatForStyle. * * @param dtpg a pointer to UDateTimePatternGenerator. * @param dtFormat * message format pattern, here {1} will be replaced by the date * pattern and {0} will be replaced by the time pattern. * @param length the length of dtFormat. * @stable ICU 3.8 */ U_CAPI void U_EXPORT2 udatpg_setDateTimeFormat(const UDateTimePatternGenerator *dtpg, const UChar *dtFormat, int32_t length); /** * Getter corresponding to setDateTimeFormat. * * There are four DateTimeFormats in a UDateTimePatternGenerator object, * corresponding to date styles UDAT_FULL..UDAT_SHORT. This method gets * the style for UDAT_MEDIUM (the default). To get them individually, see * udatpg_getDateTimeFormatForStyle. * * @param dtpg a pointer to UDateTimePatternGenerator. * @param pLength A pointer that will receive the length of the format * @return dateTimeFormat. * @stable ICU 3.8 */ U_CAPI const UChar * U_EXPORT2 udatpg_getDateTimeFormat(const UDateTimePatternGenerator *dtpg, int32_t *pLength); #if !UCONFIG_NO_FORMATTING #endif /* #if !UCONFIG_NO_FORMATTING */ /** * The decimal value is used in formatting fractions of seconds. If the * skeleton contains fractional seconds, then this is used with the * fractional seconds. 
For example, suppose that the input pattern is * "hhmmssSSSS", and the best matching pattern internally is "H:mm:ss", and * the decimal string is ",". Then the resulting pattern is modified to be * "H:mm:ss,SSSS" * * @param dtpg a pointer to UDateTimePatternGenerator. * @param decimal * @param length the length of decimal. * @stable ICU 3.8 */ U_CAPI void U_EXPORT2 udatpg_setDecimal(UDateTimePatternGenerator *dtpg, const UChar *decimal, int32_t length); /** * Getter corresponding to setDecimal. * * @param dtpg a pointer to UDateTimePatternGenerator. * @param pLength A pointer that will receive the length of the decimal string. * @return corresponding to the decimal point. * @stable ICU 3.8 */ U_CAPI const UChar * U_EXPORT2 udatpg_getDecimal(const UDateTimePatternGenerator *dtpg, int32_t *pLength); /** * Adjusts the field types (width and subtype) of a pattern to match what is * in a skeleton. That is, if you supply a pattern like "d-M H:m", and a * skeleton of "MMMMddhhmm", then the input pattern is adjusted to be * "dd-MMMM hh:mm". This is used internally to get the best match for the * input skeleton, but can also be used externally. * * Note that this function uses a non-const UDateTimePatternGenerator: * It uses a stateful pattern parser which is set up for each generator object, * rather than creating one for each function call. * Consecutive calls to this function do not affect each other, * but this function cannot be used concurrently on a single generator object. * * @param dtpg a pointer to UDateTimePatternGenerator. * @param pattern Input pattern * @param patternLength the length of input pattern. * @param skeleton * @param skeletonLength the length of input skeleton. * @param dest pattern adjusted to match the skeleton fields widths and subtypes. * @param destCapacity the capacity of dest. * @param pErrorCode a pointer to the UErrorCode which must not indicate a * failure before the function call. * @return the length of dest. 
* @stable ICU 3.8
 */
U_CAPI int32_t U_EXPORT2
udatpg_replaceFieldTypes(UDateTimePatternGenerator *dtpg,
                         const UChar *pattern,
                         int32_t patternLength,
                         const UChar *skeleton,
                         int32_t skeletonLength,
                         UChar *dest,
                         int32_t destCapacity,
                         UErrorCode *pErrorCode);

/**
 * Adjusts the field types (width and subtype) of a pattern to match what is
 * in a skeleton. That is, if you supply a pattern like "d-M H:m", and a
 * skeleton of "MMMMddhhmm", then the input pattern is adjusted to be
 * "dd-MMMM hh:mm". This is used internally to get the best match for the
 * input skeleton, but can also be used externally.
 *
 * Note that this function uses a non-const UDateTimePatternGenerator:
 * It uses a stateful pattern parser which is set up for each generator object,
 * rather than creating one for each function call.
 * Consecutive calls to this function do not affect each other,
 * but this function cannot be used concurrently on a single generator object.
 *
 * @param dtpg a pointer to UDateTimePatternGenerator.
 * @param pattern Input pattern
 * @param patternLength the length of input pattern.
 * @param skeleton
 * @param skeletonLength the length of input skeleton.
 * @param options
 *            Options controlling whether the length of specified fields in the
 *            pattern are adjusted to match those in the skeleton (when this
 *            would not happen otherwise). For default behavior, use
 *            UDATPG_MATCH_NO_OPTIONS.
 * @param dest pattern adjusted to match the skeleton fields widths and subtypes.
 * @param destCapacity the capacity of dest.
 * @param pErrorCode a pointer to the UErrorCode which must not indicate a
 *                   failure before the function call.
 * @return the length of dest.
* @stable ICU 4.4 */ U_CAPI int32_t U_EXPORT2 udatpg_replaceFieldTypesWithOptions(UDateTimePatternGenerator *dtpg, const UChar *pattern, int32_t patternLength, const UChar *skeleton, int32_t skeletonLength, UDateTimePatternMatchOptions options, UChar *dest, int32_t destCapacity, UErrorCode *pErrorCode); /** * Return a UEnumeration list of all the skeletons in canonical form. * Call udatpg_getPatternForSkeleton() to get the corresponding pattern. * * @param dtpg a pointer to UDateTimePatternGenerator. * @param pErrorCode a pointer to the UErrorCode which must not indicate a * failure before the function call * @return a UEnumeration list of all the skeletons * The caller must close the object. * @stable ICU 3.8 */ U_CAPI UEnumeration * U_EXPORT2 udatpg_openSkeletons(const UDateTimePatternGenerator *dtpg, UErrorCode *pErrorCode); /** * Return a UEnumeration list of all the base skeletons in canonical form. * * @param dtpg a pointer to UDateTimePatternGenerator. * @param pErrorCode a pointer to the UErrorCode which must not indicate a * failure before the function call. * @return a UEnumeration list of all the base skeletons * The caller must close the object. * @stable ICU 3.8 */ U_CAPI UEnumeration * U_EXPORT2 udatpg_openBaseSkeletons(const UDateTimePatternGenerator *dtpg, UErrorCode *pErrorCode); /** * Get the pattern corresponding to a given skeleton. * * @param dtpg a pointer to UDateTimePatternGenerator. * @param skeleton * @param skeletonLength pointer to the length of skeleton. * @param pLength pointer to the length of return pattern. * @return pattern corresponding to a given skeleton. * @stable ICU 3.8 */ U_CAPI const UChar * U_EXPORT2 udatpg_getPatternForSkeleton(const UDateTimePatternGenerator *dtpg, const UChar *skeleton, int32_t skeletonLength, int32_t *pLength); #endif // __UDATPG_H__ #if (NTDDI_VERSION >= NTDDI_WIN11_ZN) #if !UCONFIG_NO_FORMATTING /** * Return the default hour cycle for a locale. 
Uses the locale that the * UDateTimePatternGenerator was initially created with. * * Cannot be used on an empty UDateTimePatternGenerator instance. * * @param dtpg a pointer to UDateTimePatternGenerator. * @param pErrorCode a pointer to the UErrorCode which must not indicate a * failure before the function call. Set to U_UNSUPPORTED_ERROR * if used on an empty instance. * @return the default hour cycle. * @stable ICU 67 */ U_CAPI UDateFormatHourCycle U_EXPORT2 udatpg_getDefaultHourCycle(const UDateTimePatternGenerator *dtpg, UErrorCode* pErrorCode); #endif /* #if !UCONFIG_NO_FORMATTING */ // unum.h.bak // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 1997-2015, International Business Machines Corporation and others. * All Rights Reserved. * Modification History: * * Date Name Description * 06/24/99 helena Integrated Alan's NF enhancements and Java2 bug fixes ******************************************************************************* */ #ifndef _UNUM #define _UNUM #if !UCONFIG_NO_FORMATTING /** * \file * \brief C API: Compatibility APIs for number formatting. * *

Number Format C API

* *

* IMPORTANT: New users are strongly encouraged to
 * see if unumberformatter.h fits their use case. Although not deprecated,
 * this header is provided for backwards compatibility only.
 *
 * Number Format C API Provides functions for
 * formatting and parsing a number. Also provides methods for
 * determining which locales have number formats, and what their names
 * are.
 *

* UNumberFormat helps you to format and parse numbers for any locale. * Your code can be completely independent of the locale conventions * for decimal points, thousands-separators, or even the particular * decimal digits used, or whether the number format is even decimal. * There are different number format styles like decimal, currency, * percent and spellout. *

* To format a number for the current Locale, use one of the static * factory methods: *

 * \code
 *    UChar myString[20];
 *    double myNumber = 7.0;
 *    UErrorCode status = U_ZERO_ERROR;
 *    UNumberFormat* nf = unum_open(UNUM_DEFAULT, NULL, -1, NULL, NULL, &status);
 *    unum_formatDouble(nf, myNumber, myString, 20, NULL, &status);
 *    printf(" Example 1: %s\n", austrdup(myString) ); //austrdup( a function used to convert UChar* to char*)
 * \endcode
 * 
* If you are formatting multiple numbers, it is more efficient to get * the format and use it multiple times so that the system doesn't * have to fetch the information about the local language and country * conventions multiple times. *
 * \code
 * uint32_t i, resultlength, reslenneeded;
 * UErrorCode status = U_ZERO_ERROR;
 * UFieldPosition pos;
 * uint32_t a[] = { 123, 3333, -1234567 };
 * const uint32_t a_len = sizeof(a) / sizeof(a[0]);
 * UNumberFormat* nf;
 * UChar* result = NULL;
 *
 * nf = unum_open(UNUM_DEFAULT, NULL, -1, NULL, NULL, &status);
 * for (i = 0; i < a_len; i++) {
 *    resultlength=0;
 *    reslenneeded=unum_format(nf, a[i], NULL, resultlength, &pos, &status);
 *    result = NULL;
 *    if(status==U_BUFFER_OVERFLOW_ERROR){
 *       status=U_ZERO_ERROR;
 *       resultlength=reslenneeded+1;
 *       result=(UChar*)malloc(sizeof(UChar) * resultlength);
 *       unum_format(nf, a[i], result, resultlength, &pos, &status);
 *    }
 *    printf( " Example 2: %s\n", austrdup(result));
 *    free(result);
 * }
 * \endcode
 * 
* To format a number for a different Locale, specify it in the * call to unum_open(). *
 * \code
 *     UNumberFormat* nf = unum_open(UNUM_DEFAULT, NULL, -1, "fr_FR", NULL, &success);
 * \endcode
 * 
* You can use a NumberFormat API unum_parse() to parse. *
 * \code
 *    UErrorCode status = U_ZERO_ERROR;
 *    int32_t pos=0;
 *    int32_t num;
 *    num = unum_parse(nf, str, u_strlen(str), &pos, &status);
 * \endcode
 * 
* Use UNUM_DECIMAL to get the normal number format for that country. * There are other static options available. Use UNUM_CURRENCY * to get the currency number format for that country. Use UNUM_PERCENT * to get a format for displaying percentages. With this format, a * fraction from 0.53 is displayed as 53%. *

* Use a pattern to create either a DecimalFormat or a RuleBasedNumberFormat * formatter. The pattern must conform to the syntax defined for those * formatters. *

* You can also control the display of numbers with such function as * unum_getAttributes() and unum_setAttributes(), which let you set the * minimum fraction digits, grouping, etc. * @see UNumberFormatAttributes for more details *

* You can also use forms of the parse and format methods with
 * ParsePosition and UFieldPosition to allow you to:
 * <ul>
 * <li>(a) progressively parse through pieces of a string.
 * <li>(b) align the decimal point and other areas.
 * </ul>
 *
* It is also possible to change or set the symbols used for a particular
 * locale like the currency symbol, the grouping separator , monetary separator
 * etc by making use of functions unum_setSymbols() and unum_getSymbols().
 */

/** A number formatter.
 *  For usage in C programs.
 *  @stable ICU 2.0
 */
typedef void* UNumberFormat;

/** The possible number format styles.
 *  @stable ICU 2.0
 */
typedef enum UNumberFormatStyle {
    /**
     * Decimal format defined by a pattern string.
     * @stable ICU 3.0
     */
    UNUM_PATTERN_DECIMAL=0,
    /**
     * Decimal format ("normal" style).
     * @stable ICU 2.0
     */
    UNUM_DECIMAL=1,
    /**
     * Currency format (generic).
     * Defaults to UNUM_CURRENCY_STANDARD style
     * (using currency symbol, e.g., "$1.00", with non-accounting
     * style for negative values e.g. using minus sign).
     * The specific style may be specified using the -cf- locale key.
     * @stable ICU 2.0
     */
    UNUM_CURRENCY=2,
    /**
     * Percent format
     * @stable ICU 2.0
     */
    UNUM_PERCENT=3,
    /**
     * Scientific format
     * @stable ICU 2.1
     */
    UNUM_SCIENTIFIC=4,
    /**
     * Spellout rule-based format. The default ruleset can be specified/changed using
     * unum_setTextAttribute with UNUM_DEFAULT_RULESET; the available public rulesets
     * can be listed using unum_getTextAttribute with UNUM_PUBLIC_RULESETS.
     * @stable ICU 2.0
     */
    UNUM_SPELLOUT=5,
    /**
     * Ordinal rule-based format . The default ruleset can be specified/changed using
     * unum_setTextAttribute with UNUM_DEFAULT_RULESET; the available public rulesets
     * can be listed using unum_getTextAttribute with UNUM_PUBLIC_RULESETS.
     * @stable ICU 3.0
     */
    UNUM_ORDINAL=6,
    /**
     * Duration rule-based format
     * @stable ICU 3.0
     */
    UNUM_DURATION=7,
    /**
     * Numbering system rule-based format
     * @stable ICU 4.2
     */
    UNUM_NUMBERING_SYSTEM=8,
    /**
     * Rule-based format defined by a pattern string.
     * @stable ICU 3.0
     */
    UNUM_PATTERN_RULEBASED=9,
    /**
     * Currency format with an ISO currency code, e.g., "USD1.00".
     * @stable ICU 4.8
     */
    UNUM_CURRENCY_ISO=10,
    /**
     * Currency format with a pluralized currency name,
     * e.g., "1.00 US dollar" and "3.00 US dollars".
     * @stable ICU 4.8
     */
    UNUM_CURRENCY_PLURAL=11,
    /**
     * Currency format for accounting, e.g., "($3.00)" for
     * negative currency amount instead of "-$3.00" ({@link #UNUM_CURRENCY}).
     * Overrides any style specified using -cf- key in locale.
     * @stable ICU 53
     */
    UNUM_CURRENCY_ACCOUNTING=12,
    /**
     * Currency format with a currency symbol given CASH usage, e.g.,
     * "NT$3" instead of "NT$3.23".
     * @stable ICU 54
     */
    UNUM_CASH_CURRENCY=13,
    /**
     * Decimal format expressed using compact notation
     * (short form, corresponds to UNumberCompactStyle=UNUM_SHORT)
     * e.g. "23K", "45B"
     * @stable ICU 56
     */
    UNUM_DECIMAL_COMPACT_SHORT=14,
    /**
     * Decimal format expressed using compact notation
     * (long form, corresponds to UNumberCompactStyle=UNUM_LONG)
     * e.g. "23 thousand", "45 billion"
     * @stable ICU 56
     */
    UNUM_DECIMAL_COMPACT_LONG=15,
    /**
     * Currency format with a currency symbol, e.g., "$1.00",
     * using non-accounting style for negative values (e.g. minus sign).
     * Overrides any style specified using -cf- key in locale.
     * @stable ICU 56
     */
    UNUM_CURRENCY_STANDARD=16,

    /**
     * Default format
     * @stable ICU 2.0
     */
    UNUM_DEFAULT = UNUM_DECIMAL,
    /**
     * Alias for UNUM_PATTERN_DECIMAL
     * @stable ICU 3.0
     */
    UNUM_IGNORE = UNUM_PATTERN_DECIMAL
} UNumberFormatStyle;

/** The possible number format rounding modes.
 *
 *

* For more detail on rounding modes, see: * https://unicode-org.github.io/icu/userguide/format_parse/numbers/rounding-modes * * @stable ICU 2.0 */ typedef enum UNumberFormatRoundingMode { UNUM_ROUND_CEILING, UNUM_ROUND_FLOOR, UNUM_ROUND_DOWN, UNUM_ROUND_UP, /** * Half-even rounding * @stable ICU 3.8 */ UNUM_ROUND_HALFEVEN, UNUM_ROUND_HALFDOWN = UNUM_ROUND_HALFEVEN + 1, UNUM_ROUND_HALFUP, /** * UNUM_ROUND_UNNECESSARY reports an error if the formatted result is not exact. * @stable ICU 4.8 */ UNUM_ROUND_UNNECESSARY, /** * Rounds ties toward the odd number. * @stable ICU 69 */ UNUM_ROUND_HALF_ODD, /** * Rounds ties toward +∞. * @stable ICU 69 */ UNUM_ROUND_HALF_CEILING, /** * Rounds ties toward -∞. * @stable ICU 69 */ UNUM_ROUND_HALF_FLOOR, } UNumberFormatRoundingMode; /** The possible number format pad positions. * @stable ICU 2.0 */ typedef enum UNumberFormatPadPosition { UNUM_PAD_BEFORE_PREFIX, UNUM_PAD_AFTER_PREFIX, UNUM_PAD_BEFORE_SUFFIX, UNUM_PAD_AFTER_SUFFIX } UNumberFormatPadPosition; /** * Constants for specifying short or long format. * @stable ICU 51 */ typedef enum UNumberCompactStyle { /** @stable ICU 51 */ UNUM_SHORT, /** @stable ICU 51 */ UNUM_LONG /** @stable ICU 51 */ } UNumberCompactStyle; /** * Constants for specifying currency spacing * @stable ICU 4.8 */ enum UCurrencySpacing { /** @stable ICU 4.8 */ UNUM_CURRENCY_MATCH, /** @stable ICU 4.8 */ UNUM_CURRENCY_SURROUNDING_MATCH, /** @stable ICU 4.8 */ UNUM_CURRENCY_INSERT, /* Do not conditionalize the following with #ifndef U_HIDE_DEPRECATED_API, * it is needed for layout of DecimalFormatSymbols object. */ #ifndef U_FORCE_HIDE_DEPRECATED_API /** * One more than the highest normal UCurrencySpacing value. * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420. 
*/ UNUM_CURRENCY_SPACING_COUNT #endif // U_FORCE_HIDE_DEPRECATED_API }; typedef enum UCurrencySpacing UCurrencySpacing; /**< @stable ICU 4.8 */ /** * FieldPosition and UFieldPosition selectors for format fields * defined by NumberFormat and UNumberFormat. * @stable ICU 49 */ typedef enum UNumberFormatFields { /** @stable ICU 49 */ UNUM_INTEGER_FIELD, /** @stable ICU 49 */ UNUM_FRACTION_FIELD, /** @stable ICU 49 */ UNUM_DECIMAL_SEPARATOR_FIELD, /** @stable ICU 49 */ UNUM_EXPONENT_SYMBOL_FIELD, /** @stable ICU 49 */ UNUM_EXPONENT_SIGN_FIELD, /** @stable ICU 49 */ UNUM_EXPONENT_FIELD, /** @stable ICU 49 */ UNUM_GROUPING_SEPARATOR_FIELD, /** @stable ICU 49 */ UNUM_CURRENCY_FIELD, /** @stable ICU 49 */ UNUM_PERCENT_FIELD, /** @stable ICU 49 */ UNUM_PERMILL_FIELD, /** @stable ICU 49 */ UNUM_SIGN_FIELD, /** @stable ICU 64 */ UNUM_MEASURE_UNIT_FIELD, /** @stable ICU 64 */ UNUM_COMPACT_FIELD, } UNumberFormatFields; /** * Selectors with special numeric values to use locale default minimum grouping * digits for the DecimalFormat/UNumberFormat setMinimumGroupingDigits method. * Do not use these constants with the [U]NumberFormatter API. * * @stable ICU 68 */ typedef enum UNumberFormatMinimumGroupingDigits { /** * Display grouping using the default strategy for all locales. * @stable ICU 68 */ UNUM_MINIMUM_GROUPING_DIGITS_AUTO = -2, /** * Display grouping using locale defaults, except do not show grouping on * values smaller than 10000 (such that there is a minimum of two digits * before the first separator). * @stable ICU 68 */ UNUM_MINIMUM_GROUPING_DIGITS_MIN2 = -3, } UNumberFormatMinimumGroupingDigits; /** * Create and return a new UNumberFormat for formatting and parsing * numbers. A UNumberFormat may be used to format numbers by calling * {@link #unum_format }, and to parse numbers by calling {@link #unum_parse }. * The caller must call {@link #unum_close } when done to release resources * used by this object.
* @param style The type of number format to open: one of * UNUM_DECIMAL, UNUM_CURRENCY, UNUM_PERCENT, UNUM_SCIENTIFIC, * UNUM_CURRENCY_ISO, UNUM_CURRENCY_PLURAL, UNUM_SPELLOUT, * UNUM_ORDINAL, UNUM_DURATION, UNUM_NUMBERING_SYSTEM, * UNUM_PATTERN_DECIMAL, UNUM_PATTERN_RULEBASED, or UNUM_DEFAULT. * If UNUM_PATTERN_DECIMAL or UNUM_PATTERN_RULEBASED is passed then the * number format is opened using the given pattern, which must conform * to the syntax described in DecimalFormat or RuleBasedNumberFormat, * respectively. * *

NOTE: New users are strongly encouraged to * use unumf_openForSkeletonAndLocale instead of unum_open. * * @param pattern A pattern specifying the format to use. * This parameter is ignored unless the style is * UNUM_PATTERN_DECIMAL or UNUM_PATTERN_RULEBASED. * @param patternLength The number of characters in the pattern, or -1 * if null-terminated. This parameter is ignored unless the style is * UNUM_PATTERN_DECIMAL or UNUM_PATTERN_RULEBASED. * @param locale A locale identifier to use to determine formatting * and parsing conventions, or NULL to use the default locale. * @param parseErr A pointer to a UParseError struct to receive the * details of any parsing errors, or NULL if no parsing error details * are desired. * @param status A pointer to an input-output UErrorCode. * @return A pointer to a newly created UNumberFormat, or NULL if an * error occurred. * @see unum_close * @see DecimalFormat * @stable ICU 2.0 */ U_CAPI UNumberFormat* U_EXPORT2 unum_open( UNumberFormatStyle style, const UChar* pattern, int32_t patternLength, const char* locale, UParseError* parseErr, UErrorCode* status); /** * Close a UNumberFormat. * Once closed, a UNumberFormat may no longer be used. * @param fmt The formatter to close. * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 unum_close(UNumberFormat* fmt); /** * Open a copy of a UNumberFormat. * This function performs a deep copy. * @param fmt The format to copy * @param status A pointer to an UErrorCode to receive any errors. * @return A pointer to a UNumberFormat identical to fmt. * @stable ICU 2.0 */ U_CAPI UNumberFormat* U_EXPORT2 unum_clone(const UNumberFormat *fmt, UErrorCode *status); /** * Format an integer using a UNumberFormat. * The integer will be formatted according to the UNumberFormat's locale. * @param fmt The formatter to use. * @param number The number to format. * @param result A pointer to a buffer to receive the NULL-terminated formatted number.
If * the formatted number fits into result but cannot be NULL-terminated (length == resultLength) * then the error code is set to U_STRING_NOT_TERMINATED_WARNING. If the formatted number * doesn't fit into result then the error code is set to U_BUFFER_OVERFLOW_ERROR. * @param resultLength The maximum size of result. * @param pos A pointer to a UFieldPosition. On input, position->field * is read. On output, position->beginIndex and position->endIndex indicate * the beginning and ending indices of field number position->field, if such * a field exists. This parameter may be NULL, in which case it is ignored. * @param status A pointer to an UErrorCode to receive any errors * @return The total buffer size needed; if greater than resultLength, the output was truncated. * @see unum_formatInt64 * @see unum_formatDouble * @see unum_parse * @see unum_parseInt64 * @see unum_parseDouble * @see UFieldPosition * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 unum_format( const UNumberFormat* fmt, int32_t number, UChar* result, int32_t resultLength, UFieldPosition *pos, UErrorCode* status); /** * Format an int64 using a UNumberFormat. * The int64 will be formatted according to the UNumberFormat's locale. * @param fmt The formatter to use. * @param number The number to format. * @param result A pointer to a buffer to receive the NULL-terminated formatted number. If * the formatted number fits into result but cannot be NULL-terminated (length == resultLength) * then the error code is set to U_STRING_NOT_TERMINATED_WARNING. If the formatted number * doesn't fit into result then the error code is set to U_BUFFER_OVERFLOW_ERROR. * @param resultLength The maximum size of result. * @param pos A pointer to a UFieldPosition. On input, position->field * is read. On output, position->beginIndex and position->endIndex indicate * the beginning and ending indices of field number position->field, if such * a field exists.
This parameter may be NULL, in which case it is ignored. * @param status A pointer to an UErrorCode to receive any errors * @return The total buffer size needed; if greater than resultLength, the output was truncated. * @see unum_format * @see unum_formatDouble * @see unum_parse * @see unum_parseInt64 * @see unum_parseDouble * @see UFieldPosition * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 unum_formatInt64(const UNumberFormat *fmt, int64_t number, UChar* result, int32_t resultLength, UFieldPosition *pos, UErrorCode* status); /** * Format a double using a UNumberFormat. * The double will be formatted according to the UNumberFormat's locale. * @param fmt The formatter to use. * @param number The number to format. * @param result A pointer to a buffer to receive the NULL-terminated formatted number. If * the formatted number fits into result but cannot be NULL-terminated (length == resultLength) * then the error code is set to U_STRING_NOT_TERMINATED_WARNING. If the formatted number * doesn't fit into result then the error code is set to U_BUFFER_OVERFLOW_ERROR. * @param resultLength The maximum size of result. * @param pos A pointer to a UFieldPosition. On input, position->field * is read. On output, position->beginIndex and position->endIndex indicate * the beginning and ending indices of field number position->field, if such * a field exists. This parameter may be NULL, in which case it is ignored. * @param status A pointer to an UErrorCode to receive any errors * @return The total buffer size needed; if greater than resultLength, the output was truncated.
* @see unum_format * @see unum_formatInt64 * @see unum_parse * @see unum_parseInt64 * @see unum_parseDouble * @see UFieldPosition * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 unum_formatDouble( const UNumberFormat* fmt, double number, UChar* result, int32_t resultLength, UFieldPosition *pos, /* 0 if ignore */ UErrorCode* status); /** * Format a double using a UNumberFormat according to the UNumberFormat's locale, * and initialize a UFieldPositionIterator that enumerates the subcomponents of * the resulting string. * * @param format * The formatter to use. * @param number * The number to format. * @param result * A pointer to a buffer to receive the NULL-terminated formatted * number. If the formatted number fits into result but cannot be * NULL-terminated (length == resultLength) then the error code is set * to U_STRING_NOT_TERMINATED_WARNING. If the formatted number doesn't * fit into result then the error code is set to * U_BUFFER_OVERFLOW_ERROR. * @param resultLength * The maximum size of result. * @param fpositer * A pointer to a UFieldPositionIterator created by {@link #ufieldpositer_open} * (may be NULL if field position information is not needed, but in this * case it's preferable to use {@link #unum_formatDouble}). Iteration * information already present in the UFieldPositionIterator is deleted, * and the iterator is reset to apply to the fields in the formatted * string created by this function call. The field values and indexes * returned by {@link #ufieldpositer_next} represent fields denoted by * the UNumberFormatFields enum. Fields are not returned in a guaranteed * order. Fields cannot overlap, but they may nest. For example, 1234 * could format as "1,234" which might consist of a grouping separator * field for ',' and an integer field encompassing the entire string. * @param status * A pointer to an UErrorCode to receive any errors * @return * The total buffer size needed; if greater than resultLength, the * output was truncated.
* @see unum_formatDouble * @see unum_parse * @see unum_parseDouble * @see UFieldPositionIterator * @see UNumberFormatFields * @stable ICU 59 */ U_CAPI int32_t U_EXPORT2 unum_formatDoubleForFields(const UNumberFormat* format, double number, UChar* result, int32_t resultLength, UFieldPositionIterator* fpositer, UErrorCode* status); /** * Format a decimal number using a UNumberFormat. * The number will be formatted according to the UNumberFormat's locale. * The syntax of the input number is a "numeric string" * as defined in the Decimal Arithmetic Specification, available at * http://speleotrove.com/decimal * @param fmt The formatter to use. * @param number The number to format. * @param length The length of the input number, or -1 if the input is nul-terminated. * @param result A pointer to a buffer to receive the NULL-terminated formatted number. If * the formatted number fits into result but cannot be NULL-terminated (length == resultLength) * then the error code is set to U_STRING_NOT_TERMINATED_WARNING. If the formatted number * doesn't fit into result then the error code is set to U_BUFFER_OVERFLOW_ERROR. * @param resultLength The maximum size of result. * @param pos A pointer to a UFieldPosition. On input, position->field * is read. On output, position->beginIndex and position->endIndex indicate * the beginning and ending indices of field number position->field, if such * a field exists. This parameter may be NULL, in which case it is ignored. * @param status A pointer to an UErrorCode to receive any errors * @return The total buffer size needed; if greater than resultLength, the output was truncated.
* @see unum_format * @see unum_formatInt64 * @see unum_parse * @see unum_parseInt64 * @see unum_parseDouble * @see UFieldPosition * @stable ICU 4.4 */ U_CAPI int32_t U_EXPORT2 unum_formatDecimal( const UNumberFormat* fmt, const char * number, int32_t length, UChar* result, int32_t resultLength, UFieldPosition *pos, /* 0 if ignore */ UErrorCode* status); /** * Format a double currency amount using a UNumberFormat. * The double will be formatted according to the UNumberFormat's locale. * * To format an exact decimal value with a currency, use * `unum_setTextAttribute(UNUM_CURRENCY_CODE, ...)` followed by unum_formatDecimal. * Your UNumberFormat must be created with the UNUM_CURRENCY style. Alternatively, * consider using unumf_openForSkeletonAndLocale. * * @param fmt the formatter to use * @param number the number to format * @param currency the 3-letter null-terminated ISO 4217 currency code * @param result A pointer to a buffer to receive the NULL-terminated formatted number. If * the formatted number fits into result but cannot be NULL-terminated (length == resultLength) * then the error code is set to U_STRING_NOT_TERMINATED_WARNING. If the formatted number * doesn't fit into result then the error code is set to U_BUFFER_OVERFLOW_ERROR. * @param resultLength the maximum number of UChars to write to result * @param pos a pointer to a UFieldPosition. On input, * position->field is read. On output, position->beginIndex and * position->endIndex indicate the beginning and ending indices of * field number position->field, if such a field exists. This * parameter may be NULL, in which case it is ignored. * @param status a pointer to an input-output UErrorCode * @return the total buffer size needed; if greater than resultLength, * the output was truncated.
* @see unum_formatDouble * @see unum_parseDoubleCurrency * @see UFieldPosition * @stable ICU 3.0 */ U_CAPI int32_t U_EXPORT2 unum_formatDoubleCurrency(const UNumberFormat* fmt, double number, UChar* currency, UChar* result, int32_t resultLength, UFieldPosition* pos, UErrorCode* status); /** * Format a UFormattable into a string. * @param fmt the formatter to use * @param number the number to format, as a UFormattable * @param result A pointer to a buffer to receive the NULL-terminated formatted number. If * the formatted number fits into result but cannot be NULL-terminated (length == resultLength) * then the error code is set to U_STRING_NOT_TERMINATED_WARNING. If the formatted number * doesn't fit into result then the error code is set to U_BUFFER_OVERFLOW_ERROR. * @param resultLength the maximum number of UChars to write to result * @param pos a pointer to a UFieldPosition. On input, * position->field is read. On output, position->beginIndex and * position->endIndex indicate the beginning and ending indices of * field number position->field, if such a field exists. This * parameter may be NULL, in which case it is ignored. * @param status a pointer to an input-output UErrorCode * @return the total buffer size needed; if greater than resultLength, * the output was truncated. Will return 0 on error. * @see unum_parseToUFormattable * @stable ICU 52 */ U_CAPI int32_t U_EXPORT2 unum_formatUFormattable(const UNumberFormat* fmt, const UFormattable *number, UChar *result, int32_t resultLength, UFieldPosition *pos, UErrorCode *status); /** * Parse a string into an integer using a UNumberFormat. * The string will be parsed according to the UNumberFormat's locale. * Note: parsing is not supported for styles UNUM_DECIMAL_COMPACT_SHORT * and UNUM_DECIMAL_COMPACT_LONG. * @param fmt The formatter to use. * @param text The text to parse. * @param textLength The length of text, or -1 if null-terminated.
* @param parsePos If not NULL, on input a pointer to an integer specifying the offset at which * to begin parsing. If not NULL, on output the offset at which parsing ended. * If NULL, parsing begins at offset 0. * @param status A pointer to an UErrorCode to receive any errors * @return The value of the parsed integer * @see unum_parseInt64 * @see unum_parseDouble * @see unum_format * @see unum_formatInt64 * @see unum_formatDouble * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 unum_parse( const UNumberFormat* fmt, const UChar* text, int32_t textLength, int32_t *parsePos /* 0 = start */, UErrorCode *status); /** * Parse a string into an int64 using a UNumberFormat. * The string will be parsed according to the UNumberFormat's locale. * Note: parsing is not supported for styles UNUM_DECIMAL_COMPACT_SHORT * and UNUM_DECIMAL_COMPACT_LONG. * @param fmt The formatter to use. * @param text The text to parse. * @param textLength The length of text, or -1 if null-terminated. * @param parsePos If not NULL, on input a pointer to an integer specifying the offset at which * to begin parsing. If not NULL, on output the offset at which parsing ended. * If NULL, parsing begins at offset 0. * @param status A pointer to an UErrorCode to receive any errors * @return The value of the parsed integer * @see unum_parse * @see unum_parseDouble * @see unum_format * @see unum_formatInt64 * @see unum_formatDouble * @stable ICU 2.8 */ U_CAPI int64_t U_EXPORT2 unum_parseInt64(const UNumberFormat* fmt, const UChar* text, int32_t textLength, int32_t *parsePos /* 0 = start */, UErrorCode *status); /** * Parse a string into a double using a UNumberFormat. * The string will be parsed according to the UNumberFormat's locale. * Note: parsing is not supported for styles UNUM_DECIMAL_COMPACT_SHORT * and UNUM_DECIMAL_COMPACT_LONG. * @param fmt The formatter to use. * @param text The text to parse. * @param textLength The length of text, or -1 if null-terminated. * @param parsePos If not NULL, on input a pointer to an integer specifying the offset at which * to begin parsing.
If not NULL, on output the offset at which parsing ended. * If NULL, parsing begins at offset 0. * @param status A pointer to an UErrorCode to receive any errors * @return The value of the parsed double * @see unum_parse * @see unum_parseInt64 * @see unum_format * @see unum_formatInt64 * @see unum_formatDouble * @stable ICU 2.0 */ U_CAPI double U_EXPORT2 unum_parseDouble( const UNumberFormat* fmt, const UChar* text, int32_t textLength, int32_t *parsePos /* 0 = start */, UErrorCode *status); /** * Parse a number from a string into an unformatted numeric string using a UNumberFormat. * The input string will be parsed according to the UNumberFormat's locale. * The syntax of the output is a "numeric string" * as defined in the Decimal Arithmetic Specification, available at * http://speleotrove.com/decimal * Note: parsing is not supported for styles UNUM_DECIMAL_COMPACT_SHORT * and UNUM_DECIMAL_COMPACT_LONG. * @param fmt The formatter to use. * @param text The text to parse. * @param textLength The length of text, or -1 if null-terminated. * @param parsePos If not NULL, on input a pointer to an integer specifying the offset at which * to begin parsing. If not NULL, on output the offset at which parsing ended. * If NULL, parsing begins at offset 0. * @param outBuf A (char *) buffer to receive the parsed number as a string. The output string * will be nul-terminated if there is sufficient space. * @param outBufLength The size of the output buffer. May be zero, in which case * the outBuf pointer may be NULL, and the function will return the * size of the output string. * @param status A pointer to an UErrorCode to receive any errors * @return the length of the output string, not including any terminating nul.
* @see unum_parse * @see unum_parseInt64 * @see unum_format * @see unum_formatInt64 * @see unum_formatDouble * @stable ICU 4.4 */ U_CAPI int32_t U_EXPORT2 unum_parseDecimal(const UNumberFormat* fmt, const UChar* text, int32_t textLength, int32_t *parsePos /* 0 = start */, char *outBuf, int32_t outBufLength, UErrorCode *status); /** * Parse a string into a double and a currency using a UNumberFormat. * The string will be parsed according to the UNumberFormat's locale. * @param fmt the formatter to use * @param text the text to parse * @param textLength the length of text, or -1 if null-terminated * @param parsePos a pointer to an offset index into text at which to * begin parsing. On output, *parsePos will point after the last * parsed character. This parameter may be NULL, in which case parsing * begins at offset 0. * @param currency a pointer to the buffer to receive the parsed null- * terminated currency. This buffer must have a capacity of at least * 4 UChars. * @param status a pointer to an input-output UErrorCode * @return the parsed double * @see unum_parseDouble * @see unum_formatDoubleCurrency * @stable ICU 3.0 */ U_CAPI double U_EXPORT2 unum_parseDoubleCurrency(const UNumberFormat* fmt, const UChar* text, int32_t textLength, int32_t* parsePos, /* 0 = start */ UChar* currency, UErrorCode* status); /** * Parse a UChar string into a UFormattable. * Example code: * \snippet test/cintltst/cnumtst.c unum_parseToUFormattable * Note: parsing is not supported for styles UNUM_DECIMAL_COMPACT_SHORT * and UNUM_DECIMAL_COMPACT_LONG. * @param fmt the formatter to use * @param result the UFormattable to hold the result. If NULL, a new UFormattable will be allocated (which the caller must close with ufmt_close). * @param text the text to parse * @param textLength the length of text, or -1 if null-terminated * @param parsePos a pointer to an offset index into text at which to * begin parsing. On output, *parsePos will point after the last * parsed character.
This parameter may be NULL in which case parsing * begins at offset 0. * @param status a pointer to an input-output UErrorCode * @return the UFormattable. Will be ==result unless NULL was passed in for result, in which case it will be the newly opened UFormattable. * @see ufmt_getType * @see ufmt_close * @stable ICU 52 */ U_CAPI UFormattable* U_EXPORT2 unum_parseToUFormattable(const UNumberFormat* fmt, UFormattable *result, const UChar* text, int32_t textLength, int32_t* parsePos, /* 0 = start */ UErrorCode* status); /** * Set the pattern used by a UNumberFormat. This can only be used * on a DecimalFormat, other formats return U_UNSUPPORTED_ERROR * in the status. * @param format The formatter to set. * @param localized true if the pattern is localized, false otherwise. * @param pattern The new pattern * @param patternLength The length of pattern, or -1 if null-terminated. * @param parseError A pointer to UParseError to receive information * about errors that occurred during parsing, or NULL if no parse error * information is desired. * @param status A pointer to an input-output UErrorCode. * @see unum_toPattern * @see DecimalFormat * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 unum_applyPattern( UNumberFormat *format, UBool localized, const UChar *pattern, int32_t patternLength, UParseError *parseError, UErrorCode *status ); /** * Get a locale for which decimal formatting patterns are available. * A UNumberFormat in a locale returned by this function will perform the correct * formatting and parsing for the locale. The results of this call are not * valid for rule-based number formats. * @param localeIndex The index of the desired locale. * @return A locale for which number formatting patterns are available, or 0 if none. * @see unum_countAvailable * @stable ICU 2.0 */ U_CAPI const char* U_EXPORT2 unum_getAvailable(int32_t localeIndex); /** * Determine how many locales have decimal formatting patterns available.
The * results of this call are not valid for rule-based number formats. * This function is useful for determining the loop ending condition for * calls to {@link #unum_getAvailable }. * @return The number of locales for which decimal formatting patterns are available. * @see unum_getAvailable * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 unum_countAvailable(void); #if UCONFIG_HAVE_PARSEALLINPUT /* The UNumberFormatAttributeValue type cannot be #ifndef U_HIDE_INTERNAL_API, needed for .h variable declaration */ /** * @internal */ typedef enum UNumberFormatAttributeValue { /** @internal */ UNUM_FORMAT_ATTRIBUTE_VALUE_HIDDEN } UNumberFormatAttributeValue; #endif /** The possible UNumberFormat numeric attributes @stable ICU 2.0 */ typedef enum UNumberFormatAttribute { /** Parse integers only */ UNUM_PARSE_INT_ONLY, /** Use grouping separator */ UNUM_GROUPING_USED, /** Always show decimal point */ UNUM_DECIMAL_ALWAYS_SHOWN, /** Maximum integer digits */ UNUM_MAX_INTEGER_DIGITS, /** Minimum integer digits */ UNUM_MIN_INTEGER_DIGITS, /** Integer digits */ UNUM_INTEGER_DIGITS, /** Maximum fraction digits */ UNUM_MAX_FRACTION_DIGITS, /** Minimum fraction digits */ UNUM_MIN_FRACTION_DIGITS, /** Fraction digits */ UNUM_FRACTION_DIGITS, /** Multiplier */ UNUM_MULTIPLIER, /** Grouping size */ UNUM_GROUPING_SIZE, /** Rounding Mode */ UNUM_ROUNDING_MODE, /** Rounding increment */ UNUM_ROUNDING_INCREMENT, /** The width to which the output of format() is padded. */ UNUM_FORMAT_WIDTH, /** The position at which padding will take place. */ UNUM_PADDING_POSITION, /** Secondary grouping size */ UNUM_SECONDARY_GROUPING_SIZE, /** Use significant digits * @stable ICU 3.0 */ UNUM_SIGNIFICANT_DIGITS_USED, /** Minimum significant digits * @stable ICU 3.0 */ UNUM_MIN_SIGNIFICANT_DIGITS, /** Maximum significant digits * @stable ICU 3.0 */ UNUM_MAX_SIGNIFICANT_DIGITS, /** Lenient parse mode used by rule-based formats.
* @stable ICU 3.0 */ UNUM_LENIENT_PARSE, #if UCONFIG_HAVE_PARSEALLINPUT /** Consume all input. (may use fastpath). Set to UNUM_YES (require fastpath), UNUM_NO (skip fastpath), or UNUM_MAYBE (heuristic). * This is an internal ICU API. Do not use. * @internal */ UNUM_PARSE_ALL_INPUT = 20, #endif /** * Scale, which adjusts the position of the * decimal point when formatting. Amounts will be multiplied by 10 ^ (scale) * before they are formatted. The default value for the scale is 0 ( no adjustment ). * *

Example: setting the scale to 3, 123 formats as "123,000" *

Example: setting the scale to -4, 123 formats as "0.0123" * * This setting is analogous to getMultiplierScale() and setMultiplierScale() in decimfmt.h. * * @stable ICU 51 */ UNUM_SCALE = 21, /** * Minimum grouping digits; most commonly set to 2 to print "1000" instead of "1,000". * See DecimalFormat::getMinimumGroupingDigits(). * * For better control over grouping strategies, use UNumberFormatter. * * @stable ICU 64 */ UNUM_MINIMUM_GROUPING_DIGITS = 22, /** * if this attribute is set to 0, it is set to UNUM_CURRENCY_STANDARD purpose, * otherwise it is UNUM_CASH_CURRENCY purpose * Default: 0 (UNUM_CURRENCY_STANDARD purpose) * @stable ICU 54 */ UNUM_CURRENCY_USAGE = 23, /** If 1, specifies that if setting the "max integer digits" attribute would truncate a value, set an error status rather than silently truncating. * For example, formatting the value 1234 with 4 max int digits would succeed, but formatting 12345 would fail. There is no effect on parsing. * Default: 0 (not set) * @stable ICU 50 */ UNUM_FORMAT_FAIL_IF_MORE_THAN_MAX_DIGITS = 0x1000, /** * if this attribute is set to 1, specifies that, if the pattern doesn't contain an exponent, the exponent will not be parsed. If the pattern does contain an exponent, this attribute has no effect. * Has no effect on formatting. * Default: 0 (unset) * @stable ICU 50 */ UNUM_PARSE_NO_EXPONENT = 0x1001, /** * if this attribute is set to 1, specifies that, if the pattern contains a * decimal mark the input is required to have one. If this attribute is set to 0, * specifies that input does not have to contain a decimal mark. * Has no effect on formatting. * Default: 0 (unset) * @stable ICU 54 */ UNUM_PARSE_DECIMAL_MARK_REQUIRED = 0x1002, /** * Parsing: if set to 1, parsing is sensitive to case (lowercase/uppercase). * * @stable ICU 64 */ UNUM_PARSE_CASE_SENSITIVE = 0x1003, /** * Formatting: if set to 1, show the plus sign on non-negative numbers. * * For better control over sign display, use UNumberFormatter.
* * @stable ICU 64 */ UNUM_SIGN_ALWAYS_SHOWN = 0x1004, } UNumberFormatAttribute; /** * Get a numeric attribute associated with a UNumberFormat. * An example of a numeric attribute is the number of integer digits a formatter will produce. * @param fmt The formatter to query. * @param attr The attribute to query; one of UNUM_PARSE_INT_ONLY, UNUM_GROUPING_USED, * UNUM_DECIMAL_ALWAYS_SHOWN, UNUM_MAX_INTEGER_DIGITS, UNUM_MIN_INTEGER_DIGITS, UNUM_INTEGER_DIGITS, * UNUM_MAX_FRACTION_DIGITS, UNUM_MIN_FRACTION_DIGITS, UNUM_FRACTION_DIGITS, UNUM_MULTIPLIER, * UNUM_GROUPING_SIZE, UNUM_ROUNDING_MODE, UNUM_FORMAT_WIDTH, UNUM_PADDING_POSITION, UNUM_SECONDARY_GROUPING_SIZE, * UNUM_SCALE, UNUM_MINIMUM_GROUPING_DIGITS. * @return The value of attr, or -1 if the formatter doesn't have the requested attribute. The caller should use unum_hasAttribute() to tell if the attribute * is available, rather than relying on this function returning -1. * @see unum_hasAttribute * @see unum_setAttribute * @see unum_getDoubleAttribute * @see unum_setDoubleAttribute * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 unum_getAttribute(const UNumberFormat* fmt, UNumberFormatAttribute attr); /** * Set a numeric attribute associated with a UNumberFormat. * An example of a numeric attribute is the number of integer digits a formatter will produce. If the * formatter does not understand the attribute, the call is ignored. Rule-based formatters only understand * the lenient-parse attribute. The caller can use unum_hasAttribute() to find out if the formatter supports the attribute. * @param fmt The formatter to set.
* @param attr The attribute to set; one of UNUM_PARSE_INT_ONLY, UNUM_GROUPING_USED, * UNUM_DECIMAL_ALWAYS_SHOWN, UNUM_MAX_INTEGER_DIGITS, UNUM_MIN_INTEGER_DIGITS, UNUM_INTEGER_DIGITS, * UNUM_MAX_FRACTION_DIGITS, UNUM_MIN_FRACTION_DIGITS, UNUM_FRACTION_DIGITS, UNUM_MULTIPLIER, * UNUM_GROUPING_SIZE, UNUM_ROUNDING_MODE, UNUM_FORMAT_WIDTH, UNUM_PADDING_POSITION, UNUM_SECONDARY_GROUPING_SIZE, * UNUM_LENIENT_PARSE, UNUM_SCALE, UNUM_MINIMUM_GROUPING_DIGITS. * @param newValue The new value of attr. * @see unum_hasAttribute * @see unum_getAttribute * @see unum_getDoubleAttribute * @see unum_setDoubleAttribute * @see unum_getTextAttribute * @see unum_setTextAttribute * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 unum_setAttribute( UNumberFormat* fmt, UNumberFormatAttribute attr, int32_t newValue); /** * Get a numeric attribute associated with a UNumberFormat. * An example of a numeric attribute is the number of integer digits a formatter will produce. * If the formatter does not understand the attribute, -1 is returned. The caller should use unum_hasAttribute() * to determine if the attribute is supported, rather than relying on this function returning -1. * @param fmt The formatter to query. * @param attr The attribute to query; e.g. UNUM_ROUNDING_INCREMENT. * @return The value of attr, or -1 if the formatter doesn't understand the attribute. * @see unum_hasAttribute * @see unum_getAttribute * @see unum_setAttribute * @see unum_setDoubleAttribute * @see unum_getTextAttribute * @see unum_setTextAttribute * @stable ICU 2.0 */ U_CAPI double U_EXPORT2 unum_getDoubleAttribute(const UNumberFormat* fmt, UNumberFormatAttribute attr); /** * Set a numeric attribute associated with a UNumberFormat. * An example of a numeric attribute is the number of integer digits a formatter will produce. * If the formatter does not understand the attribute, this call is ignored. The caller can use * unum_hasAttribute() to tell in advance whether the formatter understands the attribute.
* @param fmt The formatter to set. * @param attr The attribute to set; e.g. UNUM_ROUNDING_INCREMENT. * @param newValue The new value of attr. * @see unum_hasAttribute * @see unum_getAttribute * @see unum_setAttribute * @see unum_getDoubleAttribute * @see unum_getTextAttribute * @see unum_setTextAttribute * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 unum_setDoubleAttribute( UNumberFormat* fmt, UNumberFormatAttribute attr, double newValue); /** The possible UNumberFormat text attributes @stable ICU 2.0*/ typedef enum UNumberFormatTextAttribute { /** Positive prefix */ UNUM_POSITIVE_PREFIX, /** Positive suffix */ UNUM_POSITIVE_SUFFIX, /** Negative prefix */ UNUM_NEGATIVE_PREFIX, /** Negative suffix */ UNUM_NEGATIVE_SUFFIX, /** The character used to pad to the format width. */ UNUM_PADDING_CHARACTER, /** The ISO currency code */ UNUM_CURRENCY_CODE, /** * The default rule set, such as "%spellout-numbering-year:", "%spellout-cardinal:", * "%spellout-ordinal-masculine-plural:", "%spellout-ordinal-feminine:", or * "%spellout-ordinal-neuter:". The available public rulesets can be listed using * unum_getTextAttribute with UNUM_PUBLIC_RULESETS. This is only available with * rule-based formatters. * @stable ICU 3.0 */ UNUM_DEFAULT_RULESET, /** * The public rule sets. This is only available with rule-based formatters. * This is a read-only attribute. The public rulesets are returned as a * single string, with each ruleset name delimited by ';' (semicolon). See the * CLDR LDML spec for more information about RBNF rulesets: * http://www.unicode.org/reports/tr35/tr35-numbers.html#Rule-Based_Number_Formatting * @stable ICU 3.0 */ UNUM_PUBLIC_RULESETS } UNumberFormatTextAttribute; /** * Get a text attribute associated with a UNumberFormat. * An example of a text attribute is the suffix for positive numbers. If the formatter * does not understand the attribute, U_UNSUPPORTED_ERROR is returned as the status.
* Rule-based formatters only understand UNUM_DEFAULT_RULESET and UNUM_PUBLIC_RULESETS. * @param fmt The formatter to query. * @param tag The attribute to query; one of UNUM_POSITIVE_PREFIX, UNUM_POSITIVE_SUFFIX, * UNUM_NEGATIVE_PREFIX, UNUM_NEGATIVE_SUFFIX, UNUM_PADDING_CHARACTER, UNUM_CURRENCY_CODE, * UNUM_DEFAULT_RULESET, or UNUM_PUBLIC_RULESETS. * @param result A pointer to a buffer to receive the attribute. * @param resultLength The maximum size of result. * @param status A pointer to an UErrorCode to receive any errors * @return The total buffer size needed; if greater than resultLength, the output was truncated. * @see unum_setTextAttribute * @see unum_getAttribute * @see unum_setAttribute * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 unum_getTextAttribute( const UNumberFormat* fmt, UNumberFormatTextAttribute tag, UChar* result, int32_t resultLength, UErrorCode* status); /** * Set a text attribute associated with a UNumberFormat. * An example of a text attribute is the suffix for positive numbers. Rule-based formatters * only understand UNUM_DEFAULT_RULESET. * @param fmt The formatter to set. * @param tag The attribute to set; one of UNUM_POSITIVE_PREFIX, UNUM_POSITIVE_SUFFIX, * UNUM_NEGATIVE_PREFIX, UNUM_NEGATIVE_SUFFIX, UNUM_PADDING_CHARACTER, UNUM_CURRENCY_CODE, * or UNUM_DEFAULT_RULESET. * @param newValue The new value of the attribute identified by tag. * @param newValueLength The length of newValue, or -1 if null-terminated. * @param status A pointer to an UErrorCode to receive any errors * @see unum_getTextAttribute * @see unum_getAttribute * @see unum_setAttribute * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 unum_setTextAttribute( UNumberFormat* fmt, UNumberFormatTextAttribute tag, const UChar* newValue, int32_t newValueLength, UErrorCode *status); /** * Extract the pattern from a UNumberFormat. The pattern will follow * the DecimalFormat pattern syntax. * @param fmt The formatter to query. * @param isPatternLocalized true if the pattern should be localized, * false otherwise.
This is ignored if the formatter is a rule-based * formatter. * @param result A pointer to a buffer to receive the pattern. * @param resultLength The maximum size of result. * @param status A pointer to an input-output UErrorCode. * @return The total buffer size needed; if greater than resultLength, * the output was truncated. * @see unum_applyPattern * @see DecimalFormat * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 unum_toPattern( const UNumberFormat* fmt, UBool isPatternLocalized, UChar* result, int32_t resultLength, UErrorCode* status); /** * Constants for specifying a number format symbol. * @stable ICU 2.0 */ typedef enum UNumberFormatSymbol { /** The decimal separator */ UNUM_DECIMAL_SEPARATOR_SYMBOL = 0, /** The grouping separator */ UNUM_GROUPING_SEPARATOR_SYMBOL = 1, /** The pattern separator */ UNUM_PATTERN_SEPARATOR_SYMBOL = 2, /** The percent sign */ UNUM_PERCENT_SYMBOL = 3, /** Zero*/ UNUM_ZERO_DIGIT_SYMBOL = 4, /** Character representing a digit in the pattern */ UNUM_DIGIT_SYMBOL = 5, /** The minus sign */ UNUM_MINUS_SIGN_SYMBOL = 6, /** The plus sign */ UNUM_PLUS_SIGN_SYMBOL = 7, /** The currency symbol */ UNUM_CURRENCY_SYMBOL = 8, /** The international currency symbol */ UNUM_INTL_CURRENCY_SYMBOL = 9, /** The monetary separator */ UNUM_MONETARY_SEPARATOR_SYMBOL = 10, /** The exponential symbol */ UNUM_EXPONENTIAL_SYMBOL = 11, /** Per mill symbol */ UNUM_PERMILL_SYMBOL = 12, /** Escape padding character */ UNUM_PAD_ESCAPE_SYMBOL = 13, /** Infinity symbol */ UNUM_INFINITY_SYMBOL = 14, /** Nan symbol */ UNUM_NAN_SYMBOL = 15, /** Significant digit symbol * @stable ICU 3.0 */ UNUM_SIGNIFICANT_DIGIT_SYMBOL = 16, /** The monetary grouping separator * @stable ICU 3.6 */ UNUM_MONETARY_GROUPING_SEPARATOR_SYMBOL = 17, /** One * @stable ICU 4.6 */ UNUM_ONE_DIGIT_SYMBOL = 18, /** Two * @stable ICU 4.6 */ UNUM_TWO_DIGIT_SYMBOL = 19, /** Three * @stable ICU 4.6 */ UNUM_THREE_DIGIT_SYMBOL = 20, /** Four * @stable ICU 4.6 */ UNUM_FOUR_DIGIT_SYMBOL = 21, /** Five * 
@stable ICU 4.6 */ UNUM_FIVE_DIGIT_SYMBOL = 22, /** Six * @stable ICU 4.6 */ UNUM_SIX_DIGIT_SYMBOL = 23, /** Seven * @stable ICU 4.6 */ UNUM_SEVEN_DIGIT_SYMBOL = 24, /** Eight * @stable ICU 4.6 */ UNUM_EIGHT_DIGIT_SYMBOL = 25, /** Nine * @stable ICU 4.6 */ UNUM_NINE_DIGIT_SYMBOL = 26, /** Multiplication sign * @stable ICU 54 */ UNUM_EXPONENT_MULTIPLICATION_SYMBOL = 27, } UNumberFormatSymbol; /** * Get a symbol associated with a UNumberFormat. * A UNumberFormat uses symbols to represent the special locale-dependent * characters in a number, for example the percent sign. This API is not * supported for rule-based formatters. * @param fmt The formatter to query. * @param symbol The UNumberFormatSymbol constant for the symbol to get * @param buffer The string buffer that will receive the symbol string; * if it is NULL, then only the length of the symbol is returned * @param size The size of the string buffer * @param status A pointer to an UErrorCode to receive any errors * @return The length of the symbol; the buffer is not modified if * length>=size * @see unum_setSymbol * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 unum_getSymbol(const UNumberFormat *fmt, UNumberFormatSymbol symbol, UChar *buffer, int32_t size, UErrorCode *status); /** * Set a symbol associated with a UNumberFormat. * A UNumberFormat uses symbols to represent the special locale-dependent * characters in a number, for example the percent sign. This API is not * supported for rule-based formatters. * @param fmt The formatter to set. * @param symbol The UNumberFormatSymbol constant for the symbol to set * @param value The string to set the symbol to * @param length The length of the string, or -1 for a zero-terminated string * @param status A pointer to an UErrorCode to receive any errors. 
* @see unum_getSymbol * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 unum_setSymbol(UNumberFormat *fmt, UNumberFormatSymbol symbol, const UChar *value, int32_t length, UErrorCode *status); /** * Get the locale for this number format object. * You can choose between valid and actual locale. * @param fmt The formatter to get the locale from * @param type type of the locale we're looking for (valid or actual) * @param status error code for the operation * @return the locale name * @stable ICU 2.8 */ U_CAPI const char* U_EXPORT2 unum_getLocaleByType(const UNumberFormat *fmt, ULocDataLocaleType type, UErrorCode* status); /** * Set a particular UDisplayContext value in the formatter, such as * UDISPCTX_CAPITALIZATION_FOR_STANDALONE. * @param fmt The formatter for which to set a UDisplayContext value. * @param value The UDisplayContext value to set. * @param status A pointer to an UErrorCode to receive any errors * @stable ICU 53 */ U_CAPI void U_EXPORT2 unum_setContext(UNumberFormat* fmt, UDisplayContext value, UErrorCode* status); /** * Get the formatter's UDisplayContext value for the specified UDisplayContextType, * such as UDISPCTX_TYPE_CAPITALIZATION. * @param fmt The formatter to query. * @param type The UDisplayContextType whose value to return * @param status A pointer to an UErrorCode to receive any errors * @return The UDisplayContextValue for the specified type. * @stable ICU 53 */ U_CAPI UDisplayContext U_EXPORT2 unum_getContext(const UNumberFormat *fmt, UDisplayContextType type, UErrorCode* status); #endif /* #if !UCONFIG_NO_FORMATTING */ #endif #endif // (NTDDI_VERSION >= NTDDI_WIN11_ZN) #if (NTDDI_VERSION >= NTDDI_WIN10_VB) // unumberformatter.h // Copyright (C) 2018 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html #ifndef __UNUMBERFORMATTER_H__ #define __UNUMBERFORMATTER_H__ #if !UCONFIG_NO_FORMATTING /** * \file * \brief C API: Localized number formatting; not recommended for C++. 
* * This is the C-compatible version of the NumberFormatter API introduced in ICU 60. C++ users should * include unicode/numberformatter.h and use the proper C++ APIs. * * The C API accepts a number skeleton string for specifying the settings for formatting, which covers a * very large subset of all possible number formatting features. For more information on number skeleton * strings, see unicode/numberformatter.h. * * When using UNumberFormatter, which is treated as immutable, the results are exported to a mutable * UFormattedNumber object, which you subsequently use for populating your string buffer or iterating over * the fields. * * Example code: *

 * // Setup:
 * UErrorCode ec = U_ZERO_ERROR;
 * UNumberFormatter* uformatter = unumf_openForSkeletonAndLocale(u"precision-integer", -1, "en", &ec);
 * UFormattedNumber* uresult = unumf_openResult(&ec);
 * if (U_FAILURE(ec)) { return; }
 *
 * // Format a double:
 * unumf_formatDouble(uformatter, 5142.3, uresult, &ec);
 * if (U_FAILURE(ec)) { return; }
 *
 * // Export the string to a malloc'd buffer:
 * int32_t len = unumf_resultToString(uresult, NULL, 0, &ec);
 * // at this point, ec == U_BUFFER_OVERFLOW_ERROR
 * ec = U_ZERO_ERROR;
 * UChar* buffer = (UChar*) malloc((len+1)*sizeof(UChar));
 * unumf_resultToString(uresult, buffer, len+1, &ec);
 * if (U_FAILURE(ec)) { return; }
 * // buffer should equal "5,142"
 *
 * // Cleanup:
 * unumf_close(uformatter);
 * unumf_closeResult(uresult);
 * free(buffer);
 * 
* * If you are a C++ user linking against the C libraries, you can use the LocalPointer versions of these * APIs. The following example uses LocalPointer with the decimal number and field position APIs: * *
 * // Setup:
 * LocalUNumberFormatterPointer uformatter(unumf_openForSkeletonAndLocale(u"percent", -1, "en", &ec));
 * LocalUFormattedNumberPointer uresult(unumf_openResult(&ec));
 * if (U_FAILURE(ec)) { return; }
 *
 * // Format a decimal number:
 * unumf_formatDecimal(uformatter.getAlias(), "9.87E-3", -1, uresult.getAlias(), &ec);
 * if (U_FAILURE(ec)) { return; }
 *
 * // Get the location of the percent sign:
 * UFieldPosition ufpos = {UNUM_PERCENT_FIELD, 0, 0};
 * unumf_resultNextFieldPosition(uresult.getAlias(), &ufpos, &ec);
 * // ufpos should contain beginIndex=7 and endIndex=8 since the string is "0.00987%"
 *
 * // No need to do any cleanup since we are using LocalPointer.
 * 
*/ #if (NTDDI_VERSION >= NTDDI_WIN10_CO) /** * An enum declaring how to render units, including currencies. Example outputs when formatting 123 USD and 123 * meters in en-CA: * *

*

    *
  • NARROW*: "$123.00" and "123 m" *
  • SHORT: "US$ 123.00" and "123 m" *
  • FULL_NAME: "123.00 US dollars" and "123 meters" *
  • ISO_CODE: "USD 123.00" and undefined behavior *
  • HIDDEN: "123.00" and "123" *
* *

* This enum is similar to {@link UMeasureFormatWidth}. * * @stable ICU 60 */ typedef enum UNumberUnitWidth { /** * Print an abbreviated version of the unit name. Similar to SHORT, but always use the shortest available * abbreviation or symbol. This option can be used when the context hints at the identity of the unit. For more * information on the difference between NARROW and SHORT, see SHORT. * *

* In CLDR, this option corresponds to the "Narrow" format for measure units and the "¤¤¤¤¤" placeholder for * currencies. * * @stable ICU 60 */ UNUM_UNIT_WIDTH_NARROW = 0, /** * Print an abbreviated version of the unit name. Similar to NARROW, but use a slightly wider abbreviation or * symbol when there may be ambiguity. This is the default behavior. * *

* For example, in es-US, the SHORT form for Fahrenheit is "{0} °F", but the NARROW form is "{0}°", * since Fahrenheit is the customary unit for temperature in that locale. * *

* In CLDR, this option corresponds to the "Short" format for measure units and the "¤" placeholder for * currencies. * * @stable ICU 60 */ UNUM_UNIT_WIDTH_SHORT = 1, /** * Print the full name of the unit, without any abbreviations. * *

* In CLDR, this option corresponds to the default format for measure units and the "¤¤¤" placeholder for * currencies. * * @stable ICU 60 */ UNUM_UNIT_WIDTH_FULL_NAME = 2, /** * Use the three-letter ISO XXX code (for example, "USD") in place of the symbol for displaying currencies. The behavior of this * option is currently undefined for use with measure units. * *

* In CLDR, this option corresponds to the "¤¤" placeholder for currencies. * * @stable ICU 60 */ UNUM_UNIT_WIDTH_ISO_CODE = 3, #if (NTDDI_VERSION >= NTDDI_WIN11_ZN) /** * Use the formal variant of the currency symbol; for example, "NT$" for the New Taiwan * dollar in zh-TW. * *

* Behavior of this option with non-currency units is not defined at this time. * * @stable ICU 68 */ UNUM_UNIT_WIDTH_FORMAL = 4, /** * Use the alternate variant of the currency symbol; for example, "TL" for the Turkish * lira (TRY). * *

* Behavior of this option with non-currency units is not defined at this time. * * @stable ICU 68 */ UNUM_UNIT_WIDTH_VARIANT = 5, #endif // (NTDDI_VERSION >= NTDDI_WIN11_ZN) /** * Format the number according to the specified unit, but do not display the unit. For currencies, apply * monetary symbols and formats as with SHORT, but omit the currency symbol. For measure units, the behavior is * equivalent to not specifying the unit at all. * * @stable ICU 60 */ UNUM_UNIT_WIDTH_HIDDEN = 6, // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API, // needed for unconditionalized struct MacroProps /** * One more than the highest UNumberUnitWidth value. * * @internal ICU 60: The numeric value may change over time; see ICU ticket #12420. */ UNUM_UNIT_WIDTH_COUNT = 7 } UNumberUnitWidth; /** * An enum declaring the strategy for when and how to display grouping separators (i.e., the * separator, often a comma or period, after every 2-3 powers of ten). The choices are several * pre-built strategies for different use cases that employ locale data whenever possible. Example * outputs for 1234 and 1234567 in en-IN: * *

    *
  • OFF: 1234 and 1234567 *
  • MIN2: 1234 and 12,34,567 *
  • AUTO: 1,234 and 12,34,567 *
  • ON_ALIGNED: 1,234 and 12,34,567 *
  • THOUSANDS: 1,234 and 1,234,567 *
* *

* The default is AUTO, which displays grouping separators unless the locale data says that grouping * is not customary. To force grouping for all numbers greater than 1000 consistently across locales, * use ON_ALIGNED. On the other hand, to display grouping less frequently than the default, use MIN2 * or OFF. See the docs of each option for details. * *

* Note: This enum specifies the strategy for grouping sizes. To set which character to use as the * grouping separator, use the "symbols" setter. * * @stable ICU 63 */ typedef enum UNumberGroupingStrategy { /** * Do not display grouping separators in any locale. * * @stable ICU 61 */ UNUM_GROUPING_OFF, /** * Display grouping using locale defaults, except do not show grouping on values smaller than * 10000 (such that there is a minimum of two digits before the first separator). * *

* Note that locales may restrict grouping separators to be displayed only on 1 million or * greater (for example, ee and hu) or disable grouping altogether (for example, bg currency). * *

* Locale data is used to determine whether to separate larger numbers into groups of 2 * (customary in South Asia) or groups of 3 (customary in Europe and the Americas). * * @stable ICU 61 */ UNUM_GROUPING_MIN2, /** * Display grouping using the default strategy for all locales. This is the default behavior. * *

* Note that locales may restrict grouping separators to be displayed only on 1 million or * greater (for example, ee and hu) or disable grouping altogether (for example, bg currency). * *

* Locale data is used to determine whether to separate larger numbers into groups of 2 * (customary in South Asia) or groups of 3 (customary in Europe and the Americas). * * @stable ICU 61 */ UNUM_GROUPING_AUTO, /** * Always display the grouping separator on values of at least 1000. * *

* This option ignores the locale data that restricts or disables grouping, described in MIN2 and * AUTO. This option may be useful to normalize the alignment of numbers, such as in a * spreadsheet. * *

* Locale data is used to determine whether to separate larger numbers into groups of 2 * (customary in South Asia) or groups of 3 (customary in Europe and the Americas). * * @stable ICU 61 */ UNUM_GROUPING_ON_ALIGNED, /** * Use the Western defaults: groups of 3 and enabled for all numbers 1000 or greater. Do not use * locale data for determining the grouping strategy. * * @stable ICU 61 */ UNUM_GROUPING_THOUSANDS } UNumberGroupingStrategy; /** * An enum declaring how to denote positive and negative numbers. Example outputs when formatting * 123, 0, and -123 in en-US: * *

    *
  • AUTO: "123", "0", and "-123" *
  • ALWAYS: "+123", "+0", and "-123" *
  • NEVER: "123", "0", and "123" *
  • ACCOUNTING: "$123", "$0", and "($123)" *
  • ACCOUNTING_ALWAYS: "+$123", "+$0", and "($123)" *
  • EXCEPT_ZERO: "+123", "0", and "-123" *
  • ACCOUNTING_EXCEPT_ZERO: "+$123", "$0", and "($123)" *
* *

* The exact format, including the position and the code point of the sign, differ by locale. * * @stable ICU 60 */ typedef enum UNumberSignDisplay { /** * Show the minus sign on negative numbers, and do not show the sign on positive numbers. This is the default * behavior. * * If using this option, a sign will be displayed on negative zero, including negative numbers * that round to zero. To hide the sign on negative zero, use the NEGATIVE option. * * @stable ICU 60 */ UNUM_SIGN_AUTO, /** * Show the minus sign on negative numbers and the plus sign on positive numbers, including zero. * To hide the sign on zero, see {@link UNUM_SIGN_EXCEPT_ZERO}. * * @stable ICU 60 */ UNUM_SIGN_ALWAYS, /** * Do not show the sign on positive or negative numbers. * * @stable ICU 60 */ UNUM_SIGN_NEVER, /** * Use the locale-dependent accounting format on negative numbers, and do not show the sign on positive numbers. * *

* The accounting format is defined in CLDR and varies by locale; in many Western locales, the format is a pair * of parentheses around the number. * *

* Note: Since CLDR defines the accounting format in the monetary context only, this option falls back to the * AUTO sign display strategy when formatting without a currency unit. This limitation may be lifted in the * future. * * @stable ICU 60 */ UNUM_SIGN_ACCOUNTING, /** * Use the locale-dependent accounting format on negative numbers, and show the plus sign on * positive numbers, including zero. For more information on the accounting format, see the * ACCOUNTING sign display strategy. To hide the sign on zero, see * {@link UNUM_SIGN_ACCOUNTING_EXCEPT_ZERO}. * * @stable ICU 60 */ UNUM_SIGN_ACCOUNTING_ALWAYS, /** * Show the minus sign on negative numbers and the plus sign on positive numbers. Do not show a * sign on zero, numbers that round to zero, or NaN. * * @stable ICU 61 */ UNUM_SIGN_EXCEPT_ZERO, /** * Use the locale-dependent accounting format on negative numbers, and show the plus sign on * positive numbers. Do not show a sign on zero, numbers that round to zero, or NaN. For more * information on the accounting format, see the ACCOUNTING sign display strategy. * * @stable ICU 61 */ UNUM_SIGN_ACCOUNTING_EXCEPT_ZERO, /** * Same as AUTO, but do not show the sign on negative zero. * * @stable ICU 69 */ UNUM_SIGN_NEGATIVE, /** * Same as ACCOUNTING, but do not show the sign on negative zero. * * @stable ICU 69 */ UNUM_SIGN_ACCOUNTING_NEGATIVE, // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API, // needed for unconditionalized struct MacroProps /** * One more than the highest UNumberSignDisplay value. * * @internal ICU 60: The numeric value may change over time; see ICU ticket #12420. */ UNUM_SIGN_COUNT = 9, } UNumberSignDisplay; /** * An enum declaring how to render the decimal separator. * *

*

    *
  • UNUM_DECIMAL_SEPARATOR_AUTO: "1", "1.1" *
  • UNUM_DECIMAL_SEPARATOR_ALWAYS: "1.", "1.1" *
* * @stable ICU 60 */ typedef enum UNumberDecimalSeparatorDisplay { /** * Show the decimal separator when there are one or more digits to display after the separator, and do not show * it otherwise. This is the default behavior. * * @stable ICU 60 */ UNUM_DECIMAL_SEPARATOR_AUTO, /** * Always show the decimal separator, even if there are no digits to display after the separator. * * @stable ICU 60 */ UNUM_DECIMAL_SEPARATOR_ALWAYS, // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API, // needed for unconditionalized struct MacroProps /** * One more than the highest UNumberDecimalSeparatorDisplay value. * * @internal ICU 60: The numeric value may change over time; see ICU ticket #12420. */ UNUM_DECIMAL_SEPARATOR_COUNT } UNumberDecimalSeparatorDisplay; #endif // (NTDDI_VERSION >= NTDDI_WIN10_CO) #if (NTDDI_VERSION >= NTDDI_WIN11_ZN) /** * An enum declaring how to render trailing zeros. * * - UNUM_TRAILING_ZERO_AUTO: 0.90, 1.00, 1.10 * - UNUM_TRAILING_ZERO_HIDE_IF_WHOLE: 0.90, 1, 1.10 * * @stable ICU 69 */ typedef enum UNumberTrailingZeroDisplay { /** * Display trailing zeros according to the settings for minimum fraction and significant digits. * * @stable ICU 69 */ UNUM_TRAILING_ZERO_AUTO, /** * Same as AUTO, but hide trailing zeros after the decimal separator if they are all zero. * * @stable ICU 69 */ UNUM_TRAILING_ZERO_HIDE_IF_WHOLE, } UNumberTrailingZeroDisplay; #endif // (NTDDI_VERSION >= NTDDI_WIN11_ZN) struct UNumberFormatter; /** * C-compatible version of icu::number::LocalizedNumberFormatter. * * NOTE: This is a C-compatible API; C++ users should build against numberformatter.h instead. * * @stable ICU 62 */ typedef struct UNumberFormatter UNumberFormatter; struct UFormattedNumber; /** * C-compatible version of icu::number::FormattedNumber. * * NOTE: This is a C-compatible API; C++ users should build against numberformatter.h instead. 
* * @stable ICU 62 */ typedef struct UFormattedNumber UFormattedNumber; /** * Creates a new UNumberFormatter for the given skeleton string and locale. This is currently the only * method for creating a new UNumberFormatter. * * Objects of type UNumberFormatter returned by this method are threadsafe. * * For more details on skeleton strings, see the documentation in numberformatter.h. For more details on * the usage of this API, see the documentation at the top of unumberformatter.h. * * For more information on number skeleton strings, see: * https://unicode-org.github.io/icu/userguide/format_parse/numbers/skeletons.html * * NOTE: This is a C-compatible API; C++ users should build against numberformatter.h instead. * * @param skeleton The skeleton string, like u"percent precision-integer" * @param skeletonLen The number of UChars in the skeleton string, or -1 if it is NUL-terminated. * @param locale The NUL-terminated locale ID. * @param ec Set if an error occurs. * @return The newly created UNumberFormatter; release it with unumf_close(). * @stable ICU 62 */ U_CAPI UNumberFormatter* U_EXPORT2 unumf_openForSkeletonAndLocale(const UChar* skeleton, int32_t skeletonLen, const char* locale, UErrorCode* ec); #if (NTDDI_VERSION >= NTDDI_WIN10_CO) /** * Like unumf_openForSkeletonAndLocale, but accepts a UParseError, which will be populated with the * location of a skeleton syntax error if such a syntax error exists. * * For more information on number skeleton strings, see: * https://unicode-org.github.io/icu/userguide/format_parse/numbers/skeletons.html * * @param skeleton The skeleton string, like u"percent precision-integer" * @param skeletonLen The number of UChars in the skeleton string, or -1 if it is NUL-terminated. * @param locale The NUL-terminated locale ID. * @param perror A parse error struct populated if an error occurs when parsing. Can be NULL. * If no error occurs, perror->offset will be set to -1. * @param ec Set if an error occurs. * @return The newly created UNumberFormatter. 
* @stable ICU 64 */ U_CAPI UNumberFormatter* U_EXPORT2 unumf_openForSkeletonAndLocaleWithError( const UChar* skeleton, int32_t skeletonLen, const char* locale, UParseError* perror, UErrorCode* ec); #endif // (NTDDI_VERSION >= NTDDI_WIN10_CO) /** * Creates an object to hold the result of a UNumberFormatter * operation. The object can be used repeatedly; it is cleared whenever * passed to a format function. * * @param ec Set if an error occurs. * @return The newly created UFormattedNumber; release it with unumf_closeResult(). * @stable ICU 62 */ U_CAPI UFormattedNumber* U_EXPORT2 unumf_openResult(UErrorCode* ec); /** * Uses a UNumberFormatter to format an integer to a UFormattedNumber. A string, field position, and other * information can be retrieved from the UFormattedNumber. * * The UNumberFormatter can be shared between threads. Each thread should have its own local * UFormattedNumber, however, for storing the result of the formatting operation. * * NOTE: This is a C-compatible API; C++ users should build against numberformatter.h instead. * * @param uformatter A formatter object created by unumf_openForSkeletonAndLocale or similar. * @param value The number to be formatted. * @param uresult The object that will be mutated to store the result; see unumf_openResult. * @param ec Set if an error occurs. * @stable ICU 62 */ U_CAPI void U_EXPORT2 unumf_formatInt(const UNumberFormatter* uformatter, int64_t value, UFormattedNumber* uresult, UErrorCode* ec); /** * Uses a UNumberFormatter to format a double to a UFormattedNumber. A string, field position, and other * information can be retrieved from the UFormattedNumber. * * The UNumberFormatter can be shared between threads. Each thread should have its own local * UFormattedNumber, however, for storing the result of the formatting operation. * * NOTE: This is a C-compatible API; C++ users should build against numberformatter.h instead. * * @param uformatter A formatter object created by unumf_openForSkeletonAndLocale or similar. 
* @param value The number to be formatted. 
* @param uresult The object that will be mutated to store the result; see unumf_openResult. * @param ec Set if an error occurs. * @stable ICU 62 */ U_CAPI void U_EXPORT2 unumf_formatDouble(const UNumberFormatter* uformatter, double value, UFormattedNumber* uresult, UErrorCode* ec); /** * Uses a UNumberFormatter to format a decimal number to a UFormattedNumber. A string, field position, and * other information can be retrieved from the UFormattedNumber. * * The UNumberFormatter can be shared between threads. Each thread should have its own local * UFormattedNumber, however, for storing the result of the formatting operation. * * The syntax of the unformatted number is a "numeric string" as defined in the Decimal Arithmetic * Specification, available at http://speleotrove.com/decimal * * NOTE: This is a C-compatible API; C++ users should build against numberformatter.h instead. * * @param uformatter A formatter object created by unumf_openForSkeletonAndLocale or similar. * @param value The numeric string to be formatted. * @param valueLen The length of the numeric string, or -1 if it is NUL-terminated. * @param uresult The object that will be mutated to store the result; see unumf_openResult. * @param ec Set if an error occurs. * @stable ICU 62 */ U_CAPI void U_EXPORT2 unumf_formatDecimal(const UNumberFormatter* uformatter, const char* value, int32_t valueLen, UFormattedNumber* uresult, UErrorCode* ec); #if (NTDDI_VERSION >= NTDDI_WIN10_CO) /** * Returns a representation of a UFormattedNumber as a UFormattedValue, * which can be subsequently passed to any API requiring that type. * * The returned object is owned by the UFormattedNumber and is valid * only as long as the UFormattedNumber is present and unchanged in memory. * * You can think of this method as a cast between types. * * @param uresult The object containing the formatted string. * @param ec Set if an error occurs. * @return A UFormattedValue owned by the input object. 
* @stable ICU 64 */ U_CAPI const UFormattedValue* U_EXPORT2 unumf_resultAsValue(const UFormattedNumber* uresult, UErrorCode* ec); #endif // (NTDDI_VERSION >= NTDDI_WIN10_CO) /** * Extracts the result number string out of a UFormattedNumber to a UChar buffer if possible. * If bufferCapacity is greater than the required length, a terminating NUL is written. * If bufferCapacity is less than the required length, an error code is set. * * Also see ufmtval_getString, which returns a NUL-terminated string: * * int32_t len; * const UChar* str = ufmtval_getString(unumf_resultAsValue(uresult, &ec), &len, &ec); * * NOTE: This is a C-compatible API; C++ users should build against numberformatter.h instead. * * @param uresult The object containing the formatted number. * @param buffer Where to save the string output. * @param bufferCapacity The number of UChars available in the buffer. * @param ec Set if an error occurs. * @return The required length. * @stable ICU 62 */ U_CAPI int32_t U_EXPORT2 unumf_resultToString(const UFormattedNumber* uresult, UChar* buffer, int32_t bufferCapacity, UErrorCode* ec); /** * Determines the start and end indices of the next occurrence of the given field in the * output string. This allows you to determine the locations of, for example, the integer part, * fraction part, or symbols. * * This is a simpler but less powerful alternative to {@link ufmtval_nextPosition}. * * If a field occurs just once, calling this method will find that occurrence and return it. If a * field occurs multiple times, this method may be called repeatedly with the following pattern: * *
 * UFieldPosition ufpos = {UNUM_GROUPING_SEPARATOR_FIELD, 0, 0};
 * while (unumf_resultNextFieldPosition(uresult, &ufpos, &ec)) {
 *   // do something with ufpos.
 * }
 * 
* * This method is useful if you know which field to query. If you want all available field position * information, use unumf_resultGetAllFieldPositions(). * * NOTE: All fields of the UFieldPosition must be initialized before calling this method. * * @param uresult The object containing the formatted number. * @param ufpos * Input+output variable. On input, the "field" property determines which field to look up, * and the "endIndex" property determines where to begin the search. On output, the * "beginIndex" field is set to the beginning of the first occurrence of the field after the * input "endIndex", and "endIndex" is set to the end of that occurrence of the field * (exclusive index). If a field position is not found, the FieldPosition is not changed and * the method returns false. * @param ec Set if an error occurs. * @stable ICU 62 */ U_CAPI UBool U_EXPORT2 unumf_resultNextFieldPosition(const UFormattedNumber* uresult, UFieldPosition* ufpos, UErrorCode* ec); /** * Populates the given iterator with all fields in the formatted output string. This allows you to * determine the locations of the integer part, fraction part, and sign. * * This is an alternative to the more powerful {@link ufmtval_nextPosition} API. * * If you need information on only one field, use {@link ufmtval_nextPosition} or * {@link unumf_resultNextFieldPosition}. * * @param uresult The object containing the formatted number. * @param ufpositer * A pointer to a UFieldPositionIterator created by {@link #ufieldpositer_open}. Iteration * information already present in the UFieldPositionIterator is deleted, and the iterator is reset * to apply to the fields in the formatted string created by this function call. The field values * and indexes returned by {@link #ufieldpositer_next} represent fields denoted by * the UNumberFormatFields enum. Fields are not returned in a guaranteed order. Fields cannot * overlap, but they may nest. 
For example, 1234 could format as "1,234" which might consist of a * grouping separator field for ',' and an integer field encompassing the entire string. * @param ec Set if an error occurs. * @stable ICU 62 */ U_CAPI void U_EXPORT2 unumf_resultGetAllFieldPositions(const UFormattedNumber* uresult, UFieldPositionIterator* ufpositer, UErrorCode* ec); #if (NTDDI_VERSION >= NTDDI_WIN11_ZN) /** * Extracts the formatted number as a "numeric string" conforming to the * syntax defined in the Decimal Arithmetic Specification, available at * http://speleotrove.com/decimal * * This endpoint is useful for obtaining the exact number being printed * after scaling and rounding have been applied by the number formatter. * * @param uresult The input object containing the formatted number. * @param dest the 8-bit char buffer into which the decimal number is placed * @param destCapacity The size, in chars, of the destination buffer. May be zero * for precomputing the required size. * @param ec receives any error status. * If U_BUFFER_OVERFLOW_ERROR: Returns number of chars for * preflighting. * @return Number of chars in the data. Does not include a trailing NUL. * @stable ICU 68 */ U_CAPI int32_t U_EXPORT2 unumf_resultToDecimalNumber( const UFormattedNumber* uresult, char* dest, int32_t destCapacity, UErrorCode* ec); #endif // (NTDDI_VERSION >= NTDDI_WIN11_ZN) /** * Releases the UNumberFormatter created by unumf_openForSkeletonAndLocale(). * * @param uformatter An object created by unumf_openForSkeletonAndLocale(). * @stable ICU 62 */ U_CAPI void U_EXPORT2 unumf_close(UNumberFormatter* uformatter); /** * Releases the UFormattedNumber created by unumf_openResult(). * * @param uresult An object created by unumf_openResult(). 
* @stable ICU 62 */ U_CAPI void U_EXPORT2 unumf_closeResult(UFormattedNumber* uresult); #endif /* #if !UCONFIG_NO_FORMATTING */ #endif //__UNUMBERFORMATTER_H__ #endif // (NTDDI_VERSION >= NTDDI_WIN10_VB) #if (NTDDI_VERSION >= NTDDI_WIN10_CO) // unumberrangeformatter.h // Copyright (C) 2020 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html #ifndef __UNUMBERRANGEFORMATTER_H__ #define __UNUMBERRANGEFORMATTER_H__ #if !UCONFIG_NO_FORMATTING /** * \file * \brief C API: Localized number range formatting * * This is the C-compatible version of the NumberRangeFormatter API. C++ users * should include unicode/numberrangeformatter.h and use the proper C++ APIs. * * First create a UNumberRangeFormatter, which is immutable, and then format to * a UFormattedNumberRange. * * Example code: *
 * // Setup:
 * UErrorCode ec = U_ZERO_ERROR;
 * UNumberRangeFormatter* uformatter = unumrf_openForSkeletonWithCollapseAndIdentityFallback(
 *     u"currency/USD precision-integer",
 *     -1,
 *     UNUM_RANGE_COLLAPSE_AUTO,
 *     UNUM_IDENTITY_FALLBACK_APPROXIMATELY,
 *     "en-US",
 *     NULL,
 *     &ec);
 * UFormattedNumberRange* uresult = unumrf_openResult(&ec);
 * if (U_FAILURE(ec)) { return; }
 *
 * // Format a double range:
 * unumrf_formatDoubleRange(uformatter, 3.0, 5.0, uresult, &ec);
 * if (U_FAILURE(ec)) { return; }
 *
 * // Get the result string:
 * int32_t len;
 * const UChar* str = ufmtval_getString(unumrf_resultAsValue(uresult, &ec), &len, &ec);
 * if (U_FAILURE(ec)) { return; }
 * // str should equal "$3 – $5"
 *
 * // Cleanup:
 * unumrf_close(uformatter);
 * unumrf_closeResult(uresult);
 * 
* * If you are a C++ user linking against the C libraries, you can use the LocalPointer versions of these * APIs. The following example uses LocalPointer to format a double number range: * *
 * // Setup:
 * LocalUNumberRangeFormatterPointer uformatter(
 *     unumrf_openForSkeletonWithCollapseAndIdentityFallback(...));
 * LocalUFormattedNumberRangePointer uresult(unumrf_openResult(&ec));
 * if (U_FAILURE(ec)) { return; }
 *
 * // Format a double number range:
 * unumrf_formatDoubleRange(uformatter.getAlias(), 3.0, 5.0, uresult.getAlias(), &ec);
 * if (U_FAILURE(ec)) { return; }
 *
 * // No need to do any cleanup since we are using LocalPointer.
 * 
* * You can also get field positions. For more information, see uformattedvalue.h. */ /** * Defines how to merge fields that are identical across the range sign. * * @stable ICU 63 */ typedef enum UNumberRangeCollapse { /** * Use locale data and heuristics to determine how much of the string to collapse. Could end up collapsing none, * some, or all repeated pieces in a locale-sensitive way. * * The heuristics used for this option are subject to change over time. * * @stable ICU 63 */ UNUM_RANGE_COLLAPSE_AUTO, /** * Do not collapse any part of the number. Example: "3.2 thousand kilograms – 5.3 thousand kilograms" * * @stable ICU 63 */ UNUM_RANGE_COLLAPSE_NONE, /** * Collapse the unit part of the number, but not the notation, if present. Example: "3.2 thousand – 5.3 thousand * kilograms" * * @stable ICU 63 */ UNUM_RANGE_COLLAPSE_UNIT, /** * Collapse any field that is equal across the range sign. May introduce ambiguity on the magnitude of the * number. Example: "3.2 – 5.3 thousand kilograms" * * @stable ICU 63 */ UNUM_RANGE_COLLAPSE_ALL } UNumberRangeCollapse; /** * Defines the behavior when the two numbers in the range are identical after rounding. To programmatically detect * when the identity fallback is used, compare the lower and upper BigDecimals via FormattedNumber. * * @stable ICU 63 * @see NumberRangeFormatter */ typedef enum UNumberRangeIdentityFallback { /** * Show the number as a single value rather than a range. Example: "$5" * * @stable ICU 63 */ UNUM_IDENTITY_FALLBACK_SINGLE_VALUE, /** * Show the number using a locale-sensitive approximation pattern. If the numbers were the same before rounding, * show the single value. Example: "~$5" or "$5" * * @stable ICU 63 */ UNUM_IDENTITY_FALLBACK_APPROXIMATELY_OR_SINGLE_VALUE, /** * Show the number using a locale-sensitive approximation pattern. Use the range pattern always, even if the * inputs are the same. 
Example: "~$5" * * @stable ICU 63 */ UNUM_IDENTITY_FALLBACK_APPROXIMATELY, /** * Show the number as the range of two equal values. Use the range pattern always, even if the inputs are the * same. Example (with RangeCollapse.NONE): "$5 – $5" * * @stable ICU 63 */ UNUM_IDENTITY_FALLBACK_RANGE } UNumberRangeIdentityFallback; /** * Used in the result class FormattedNumberRange to indicate to the user whether the numbers formatted in the range * were equal or not, and whether or not the identity fallback was applied. * * @stable ICU 63 * @see NumberRangeFormatter */ typedef enum UNumberRangeIdentityResult { /** * Used to indicate that the two numbers in the range were equal, even before any rounding rules were applied. * * @stable ICU 63 * @see NumberRangeFormatter */ UNUM_IDENTITY_RESULT_EQUAL_BEFORE_ROUNDING, /** * Used to indicate that the two numbers in the range were equal, but only after rounding rules were applied. * * @stable ICU 63 * @see NumberRangeFormatter */ UNUM_IDENTITY_RESULT_EQUAL_AFTER_ROUNDING, /** * Used to indicate that the two numbers in the range were not equal, even after rounding rules were applied. * * @stable ICU 63 * @see NumberRangeFormatter */ UNUM_IDENTITY_RESULT_NOT_EQUAL, } UNumberRangeIdentityResult; #if (NTDDI_VERSION >= NTDDI_WIN11_ZN) struct UNumberRangeFormatter; /** * C-compatible version of icu::number::LocalizedNumberRangeFormatter. * * NOTE: This is a C-compatible API; C++ users should build against numberrangeformatter.h instead. * * @stable ICU 68 */ typedef struct UNumberRangeFormatter UNumberRangeFormatter; struct UFormattedNumberRange; /** * C-compatible version of icu::number::FormattedNumberRange. * * NOTE: This is a C-compatible API; C++ users should build against numberrangeformatter.h instead. * * @stable ICU 68 */ typedef struct UFormattedNumberRange UFormattedNumberRange; /** * Creates a new UNumberRangeFormatter for the given skeleton string, collapse option, identity fallback * option, and locale.
This is currently the only method for creating a new UNumberRangeFormatter. * * Objects of type UNumberRangeFormatter returned by this method are threadsafe. * * For more details on skeleton strings, see the documentation in numberrangeformatter.h. For more * details on the usage of this API, see the documentation at the top of unumberrangeformatter.h. * * NOTE: This is a C-compatible API; C++ users should build against numberrangeformatter.h instead. * * @param skeleton The skeleton string, like u"percent precision-integer" * @param skeletonLen The number of UChars in the skeleton string, or -1 if it is NUL-terminated. * @param collapse Option for how to merge affixes (if unsure, use UNUM_RANGE_COLLAPSE_AUTO) * @param identityFallback Option for resolving when both sides of the range are equal. * @param locale The NUL-terminated locale ID. * @param perror A parse error struct populated if an error occurs when parsing. Can be NULL. * If no error occurs, perror->offset will be set to -1. * @param ec Set if an error occurs. * @stable ICU 68 */ U_CAPI UNumberRangeFormatter* U_EXPORT2 unumrf_openForSkeletonWithCollapseAndIdentityFallback( const UChar* skeleton, int32_t skeletonLen, UNumberRangeCollapse collapse, UNumberRangeIdentityFallback identityFallback, const char* locale, UParseError* perror, UErrorCode* ec); /** * Creates an object to hold the result of a UNumberRangeFormatter * operation. The object can be used repeatedly; it is cleared whenever * passed to a format function. * * @param ec Set if an error occurs. * @stable ICU 68 */ U_CAPI UFormattedNumberRange* U_EXPORT2 unumrf_openResult(UErrorCode* ec); /** * Uses a UNumberRangeFormatter to format a range of doubles. * * The UNumberRangeFormatter can be shared between threads. Each thread should have its own local * UFormattedNumberRange, however, for storing the result of the formatting operation. * * NOTE: This is a C-compatible API; C++ users should build against numberrangeformatter.h instead.
* * @param uformatter A formatter object; see unumberrangeformatter.h. * @param first The first (usually smaller) number in the range. * @param second The second (usually larger) number in the range. * @param uresult The object that will be mutated to store the result; see unumrf_openResult. * @param ec Set if an error occurs. * @stable ICU 68 */ U_CAPI void U_EXPORT2 unumrf_formatDoubleRange( const UNumberRangeFormatter* uformatter, double first, double second, UFormattedNumberRange* uresult, UErrorCode* ec); /** * Uses a UNumberRangeFormatter to format a range of decimal numbers. * * With a decimal number string, you can specify an input with arbitrary precision. * * The UNumberRangeFormatter can be shared between threads. Each thread should have its own local * UFormattedNumberRange, however, for storing the result of the formatting operation. * * NOTE: This is a C-compatible API; C++ users should build against numberrangeformatter.h instead. * * @param uformatter A formatter object; see unumberrangeformatter.h. * @param first The first (usually smaller) number in the range. * @param firstLen The length of the first decimal number string. * @param second The second (usually larger) number in the range. * @param secondLen The length of the second decimal number string. * @param uresult The object that will be mutated to store the result; see unumrf_openResult. * @param ec Set if an error occurs. * @stable ICU 68 */ U_CAPI void U_EXPORT2 unumrf_formatDecimalRange( const UNumberRangeFormatter* uformatter, const char* first, int32_t firstLen, const char* second, int32_t secondLen, UFormattedNumberRange* uresult, UErrorCode* ec); /** * Returns a representation of a UFormattedNumberRange as a UFormattedValue, * which can be subsequently passed to any API requiring that type. * * The returned object is owned by the UFormattedNumberRange and is valid * only as long as the UFormattedNumberRange is present and unchanged in memory.
* * You can think of this method as a cast between types. * * @param uresult The object containing the formatted number range. * @param ec Set if an error occurs. * @return A UFormattedValue owned by the input object. * @stable ICU 68 */ U_CAPI const UFormattedValue* U_EXPORT2 unumrf_resultAsValue(const UFormattedNumberRange* uresult, UErrorCode* ec); /** * Extracts the identity result from a UFormattedNumberRange. * * NOTE: This is a C-compatible API; C++ users should build against numberrangeformatter.h instead. * * @param uresult The object containing the formatted number range. * @param ec Set if an error occurs. * @return The identity result; see UNumberRangeIdentityResult. * @stable ICU 68 */ U_CAPI UNumberRangeIdentityResult U_EXPORT2 unumrf_resultGetIdentityResult( const UFormattedNumberRange* uresult, UErrorCode* ec); /** * Extracts the first formatted number as a decimal number. This endpoint * is useful for obtaining the exact number being printed after scaling * and rounding have been applied by the number range formatting pipeline. * * The syntax of the unformatted number is a "numeric string" * as defined in the Decimal Arithmetic Specification, available at * http://speleotrove.com/decimal * * @param uresult The input object containing the formatted number range. * @param dest the 8-bit char buffer into which the decimal number is placed * @param destCapacity The size, in chars, of the destination buffer. May be zero * for precomputing the required size. * @param ec receives any error status. * If U_BUFFER_OVERFLOW_ERROR: Returns number of chars for * preflighting. * @return Number of chars in the data. Does not include a trailing NUL. * @stable ICU 68 */ U_CAPI int32_t U_EXPORT2 unumrf_resultGetFirstDecimalNumber( const UFormattedNumberRange* uresult, char* dest, int32_t destCapacity, UErrorCode* ec); /** * Extracts the second formatted number as a decimal number.
This endpoint * is useful for obtaining the exact number being printed after scaling * and rounding have been applied by the number range formatting pipeline. * * The syntax of the unformatted number is a "numeric string" * as defined in the Decimal Arithmetic Specification, available at * http://speleotrove.com/decimal * * @param uresult The input object containing the formatted number range. * @param dest the 8-bit char buffer into which the decimal number is placed * @param destCapacity The size, in chars, of the destination buffer. May be zero * for precomputing the required size. * @param ec receives any error status. * If U_BUFFER_OVERFLOW_ERROR: Returns number of chars for * preflighting. * @return Number of chars in the data. Does not include a trailing NUL. * @stable ICU 68 */ U_CAPI int32_t U_EXPORT2 unumrf_resultGetSecondDecimalNumber( const UFormattedNumberRange* uresult, char* dest, int32_t destCapacity, UErrorCode* ec); /** * Releases the UNumberRangeFormatter created by unumrf_openForSkeletonWithCollapseAndIdentityFallback(). * * @param uformatter An object created by unumrf_openForSkeletonWithCollapseAndIdentityFallback(). * @stable ICU 68 */ U_CAPI void U_EXPORT2 unumrf_close(UNumberRangeFormatter* uformatter); /** * Releases the UFormattedNumberRange created by unumrf_openResult(). * * @param uresult An object created by unumrf_openResult(). * @stable ICU 68 */ U_CAPI void U_EXPORT2 unumrf_closeResult(UFormattedNumberRange* uresult); #endif // (NTDDI_VERSION >= NTDDI_WIN11_ZN) #endif /* #if !UCONFIG_NO_FORMATTING */ #endif //__UNUMBERRANGEFORMATTER_H__ #endif // (NTDDI_VERSION >= NTDDI_WIN10_CO) // unumsys.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ***************************************************************************************** * Copyright (C) 2013-2014, International Business Machines * Corporation and others. All Rights Reserved.
***************************************************************************************** */ #ifndef UNUMSYS_H #define UNUMSYS_H #if !UCONFIG_NO_FORMATTING /** * \file * \brief C API: UNumberingSystem, information about numbering systems * * Defines numbering systems. A numbering system describes the scheme by which * numbers are to be presented to the end user. In its simplest form, a numbering * system describes the set of digit characters that are to be used to display * numbers, such as Western digits, Thai digits, Arabic-Indic digits, etc., in a * positional numbering system with a specified radix (typically 10). * More complicated numbering systems are algorithmic in nature, and require use * of an RBNF formatter (rule based number formatter), in order to calculate * the characters to be displayed for a given number. Examples of algorithmic * numbering systems include Roman numerals, Chinese numerals, and Hebrew numerals. * Formatting rules for many commonly used numbering systems are included in * the ICU package, based on the numbering system rules defined in CLDR. * Alternate numbering systems can be specified to a locale by using the * numbers locale keyword. */ /** * Opaque UNumberingSystem object for use in C programs. * @stable ICU 52 */ struct UNumberingSystem; typedef struct UNumberingSystem UNumberingSystem; /**< C typedef for struct UNumberingSystem. @stable ICU 52 */ /** * Opens a UNumberingSystem object using the default numbering system for the specified * locale. * @param locale The locale for which the default numbering system should be opened. * @param status A pointer to a UErrorCode to receive any errors. For example, this * may be U_UNSUPPORTED_ERROR for a locale such as "en@numbers=xyz" that * specifies a numbering system unknown to ICU. * @return A UNumberingSystem for the specified locale, or NULL if an error * occurred. 
* @stable ICU 52 */ U_CAPI UNumberingSystem * U_EXPORT2 unumsys_open(const char *locale, UErrorCode *status); /** * Opens a UNumberingSystem object using the name of one of the predefined numbering * systems specified by CLDR and known to ICU, such as "latn", "arabext", or "hanidec"; * the full list is returned by unumsys_openAvailableNames. Note that some of the names * listed at http://unicode.org/repos/cldr/tags/latest/common/bcp47/number.xml - e.g. * default, native, traditional, finance - do not identify specific numbering systems, * but rather key values that may only be used as part of a locale, which in turn * defines how they are mapped to a specific numbering system such as "latn" or "hant". * * @param name The name of the numbering system for which a UNumberingSystem object * should be opened. * @param status A pointer to a UErrorCode to receive any errors. For example, this * may be U_UNSUPPORTED_ERROR for a numbering system such as "xyz" that * is unknown to ICU. * @return A UNumberingSystem for the specified name, or NULL if an error * occurred. * @stable ICU 52 */ U_CAPI UNumberingSystem * U_EXPORT2 unumsys_openByName(const char *name, UErrorCode *status); /** * Close a UNumberingSystem object. Once closed it may no longer be used. * @param unumsys The UNumberingSystem object to close. * @stable ICU 52 */ U_CAPI void U_EXPORT2 unumsys_close(UNumberingSystem *unumsys); /** * Returns an enumeration over the names of all of the predefined numbering systems known * to ICU. * The numbering system names will be in alphabetical (invariant) order. * @param status A pointer to a UErrorCode to receive any errors. * @return A pointer to a UEnumeration that must be closed with uenum_close(), * or NULL if an error occurred. * @stable ICU 52 */ U_CAPI UEnumeration * U_EXPORT2 unumsys_openAvailableNames(UErrorCode *status); /** * Returns the name of the specified UNumberingSystem object (if it is one of the * predefined names known to ICU). 
* @param unumsys The UNumberingSystem whose name is desired. * @return A pointer to the name of the specified UNumberingSystem object, or * NULL if the name is not one of the ICU predefined names. The pointer * is only valid for the lifetime of the UNumberingSystem object. * @stable ICU 52 */ U_CAPI const char * U_EXPORT2 unumsys_getName(const UNumberingSystem *unumsys); /** * Returns whether the given UNumberingSystem object is for an algorithmic (not purely * positional) system. * @param unumsys The UNumberingSystem whose algorithmic status is desired. * @return true if the specified UNumberingSystem object is for an algorithmic * system. * @stable ICU 52 */ U_CAPI UBool U_EXPORT2 unumsys_isAlgorithmic(const UNumberingSystem *unumsys); /** * Returns the radix of the specified UNumberingSystem object. Simple positional * numbering systems typically have radix 10, but might have a radix of e.g. 16 for * hexadecimal. The radix is less well-defined for non-positional algorithmic systems. * @param unumsys The UNumberingSystem whose radix is desired. * @return The radix of the specified UNumberingSystem object. * @stable ICU 52 */ U_CAPI int32_t U_EXPORT2 unumsys_getRadix(const UNumberingSystem *unumsys); /** * Get the description string of the specified UNumberingSystem object. For simple * positional systems this is the ordered string of digits (with length matching * the radix), e.g. "\u3007\u4E00\u4E8C\u4E09\u56DB\u4E94\u516D\u4E03\u516B\u4E5D" * for "hanidec"; it would be "0123456789ABCDEF" for hexadecimal. For * algorithmic systems this is the name of the RBNF ruleset used for formatting, * e.g. "zh/SpelloutRules/%spellout-cardinal" for "hans" or "%greek-upper" for * "grek". * @param unumsys The UNumberingSystem whose description string is desired. * @param result A pointer to a buffer to receive the description string. * @param resultLength The maximum size of result. * @param status A pointer to a UErrorCode to receive any errors. 
* @return The total buffer size needed; if greater than resultLength, the * output was truncated. * @stable ICU 52 */ U_CAPI int32_t U_EXPORT2 unumsys_getDescription(const UNumberingSystem *unumsys, UChar *result, int32_t resultLength, UErrorCode *status); #endif /* #if !UCONFIG_NO_FORMATTING */ #endif // upluralrules.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ***************************************************************************************** * Copyright (C) 2010-2013, International Business Machines * Corporation and others. All Rights Reserved. ***************************************************************************************** */ #ifndef UPLURALRULES_H #define UPLURALRULES_H #if !UCONFIG_NO_FORMATTING // Forward-declaration struct UFormattedNumber; #if (NTDDI_VERSION >= NTDDI_WIN10_CO) struct UFormattedNumberRange; #endif // (NTDDI_VERSION >= NTDDI_WIN10_CO) /** * \file * \brief C API: Plural rules, select plural keywords for numeric values. * * A UPluralRules object defines rules for mapping non-negative numeric * values onto a small set of keywords. Rules are constructed from a text * description, consisting of a series of keywords and conditions. * The uplrules_select function examines each condition in order and * returns the keyword for the first condition that matches the number. * If none match, the default rule(other) is returned. * * For more information, see the * LDML spec, Part 3.5 Language Plural Rules: * https://www.unicode.org/reports/tr35/tr35-numbers.html#Language_Plural_Rules * * Keywords: ICU locale data has 6 predefined values - * 'zero', 'one', 'two', 'few', 'many' and 'other'. Callers need to check * the value of keyword returned by the uplrules_select function. * * These are based on CLDR Language Plural Rules. 
For these * predefined rules, see the CLDR page at * https://unicode-org.github.io/cldr-staging/charts/latest/supplemental/language_plural_rules.html */ /** * Type of plurals and PluralRules. * @stable ICU 50 */ enum UPluralType { /** * Plural rules for cardinal numbers: 1 file vs. 2 files. * @stable ICU 50 */ UPLURAL_TYPE_CARDINAL, /** * Plural rules for ordinal numbers: 1st file, 2nd file, 3rd file, 4th file, etc. * @stable ICU 50 */ UPLURAL_TYPE_ORDINAL, }; /** * @stable ICU 50 */ typedef enum UPluralType UPluralType; /** * Opaque UPluralRules object for use in C programs. * @stable ICU 4.8 */ struct UPluralRules; typedef struct UPluralRules UPluralRules; /**< C typedef for struct UPluralRules. @stable ICU 4.8 */ /** * Opens a new UPluralRules object using the predefined cardinal-number plural rules for a * given locale. * Same as uplrules_openForType(locale, UPLURAL_TYPE_CARDINAL, status). * @param locale The locale for which the rules are desired. * @param status A pointer to a UErrorCode to receive any errors. * @return A UPluralRules for the specified locale, or NULL if an error occurred. * @stable ICU 4.8 */ U_CAPI UPluralRules* U_EXPORT2 uplrules_open(const char *locale, UErrorCode *status); /** * Opens a new UPluralRules object using the predefined plural rules for a * given locale and the plural type. * @param locale The locale for which the rules are desired. * @param type The plural type (e.g., cardinal or ordinal). * @param status A pointer to a UErrorCode to receive any errors. * @return A UPluralRules for the specified locale, or NULL if an error occurred. * @stable ICU 50 */ U_CAPI UPluralRules* U_EXPORT2 uplrules_openForType(const char *locale, UPluralType type, UErrorCode *status); /** * Closes a UPluralRules object. Once closed it may no longer be used. * @param uplrules The UPluralRules object to close. 
* @stable ICU 4.8 */ U_CAPI void U_EXPORT2 uplrules_close(UPluralRules *uplrules); /** * Given a floating-point number, returns the keyword of the first rule that * applies to the number, according to the supplied UPluralRules object. * @param uplrules The UPluralRules object specifying the rules. * @param number The number for which the rule has to be determined. * @param keyword An output buffer to write the keyword of the rule that * applies to number. * @param capacity The capacity of the keyword buffer. * @param status A pointer to a UErrorCode to receive any errors. * @return The length of the keyword. * @stable ICU 4.8 */ U_CAPI int32_t U_EXPORT2 uplrules_select(const UPluralRules *uplrules, double number, UChar *keyword, int32_t capacity, UErrorCode *status); #if (NTDDI_VERSION >= NTDDI_WIN10_CO) /** * Given a formatted number, returns the keyword of the first rule * that applies to the number, according to the supplied UPluralRules object. * * A UFormattedNumber allows you to specify an exponent or trailing zeros, * which can affect the plural category. To get a UFormattedNumber, see * {@link UNumberFormatter}. * * @param uplrules The UPluralRules object specifying the rules. * @param number The formatted number for which the rule has to be determined. * @param keyword The destination buffer for the keyword of the rule that * applies to the number. * @param capacity The capacity of the keyword buffer. * @param status A pointer to a UErrorCode to receive any errors. * @return The length of the keyword. * @stable ICU 64 */ U_CAPI int32_t U_EXPORT2 uplrules_selectFormatted(const UPluralRules *uplrules, const struct UFormattedNumber* number, UChar *keyword, int32_t capacity, UErrorCode *status); #endif // (NTDDI_VERSION >= NTDDI_WIN10_CO) #if (NTDDI_VERSION >= NTDDI_WIN10_RS5) /** * Creates a string enumeration of all plural rule keywords used in this * UPluralRules object. The rule "other" is always present by default. 
* @param uplrules The UPluralRules object specifying the rules for * a given locale. * @param status A pointer to a UErrorCode to receive any errors. * @return a string enumeration over plural rule keywords, or NULL * upon error. The caller is responsible for closing the result. * @stable ICU 59 */ U_CAPI UEnumeration* U_EXPORT2 uplrules_getKeywords(const UPluralRules *uplrules, UErrorCode *status); #endif // (NTDDI_VERSION >= NTDDI_WIN10_RS5) #endif /* #if !UCONFIG_NO_FORMATTING */ #endif // uregex.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (C) 2004-2016, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * file name: uregex.h * encoding: UTF-8 * indentation:4 * * created on: 2004mar09 * created by: Andy Heninger * * ICU Regular Expressions, API for C */ /** * \file * \brief C API: Regular Expressions * *

This is a C wrapper around the C++ RegexPattern and RegexMatcher classes.

*/ #ifndef UREGEX_H #define UREGEX_H #if !UCONFIG_NO_REGULAR_EXPRESSIONS struct URegularExpression; /** * Structure representing a compiled regular expression, plus the results * of a match operation. * @stable ICU 3.0 */ typedef struct URegularExpression URegularExpression; /** * Constants for Regular Expression Match Modes. * @stable ICU 2.4 */ typedef enum URegexpFlag{ /** Enable case insensitive matching. @stable ICU 2.4 */ UREGEX_CASE_INSENSITIVE = 2, /** Allow white space and comments within patterns @stable ICU 2.4 */ UREGEX_COMMENTS = 4, /** If set, '.' matches line terminators, otherwise '.' matching stops at line end. * @stable ICU 2.4 */ UREGEX_DOTALL = 32, /** If set, treat the entire pattern as a literal string. * Metacharacters or escape sequences in the input sequence will be given * no special meaning. * * The flag UREGEX_CASE_INSENSITIVE retains its impact * on matching when used in conjunction with this flag. * The other flags become superfluous. * * @stable ICU 4.0 */ UREGEX_LITERAL = 16, /** Control behavior of "$" and "^" * If set, recognize line terminators within string, * otherwise, match only at start and end of input string. * @stable ICU 2.4 */ UREGEX_MULTILINE = 8, /** Unix-only line endings. * When this mode is enabled, only \\u000a is recognized as a line ending * in the behavior of ., ^, and $. * @stable ICU 4.0 */ UREGEX_UNIX_LINES = 1, /** Unicode word boundaries. * If set, \b uses the Unicode TR 29 definition of word boundaries. * Warning: Unicode word boundaries are quite different from * traditional regular expression word boundaries. See * http://unicode.org/reports/tr29/#Word_Boundaries * @stable ICU 2.8 */ UREGEX_UWORD = 256, /** Error on Unrecognized backslash escapes. * If set, fail with an error on patterns that contain * backslash-escaped ASCII letters without a known special * meaning. If this flag is not set, these * escaped letters represent themselves. 
* @stable ICU 4.0 */ UREGEX_ERROR_ON_UNKNOWN_ESCAPES = 512 } URegexpFlag; /** * Open (compile) an ICU regular expression. Compiles the regular expression in * string form into an internal representation using the specified match mode flags. * The resulting regular expression handle can then be used to perform various * matching operations. * * * @param pattern The Regular Expression pattern to be compiled. * @param patternLength The length of the pattern, or -1 if the pattern is * NUL terminated. * @param flags Flags that alter the default matching behavior for * the regular expression, UREGEX_CASE_INSENSITIVE, for * example. For default behavior, set this parameter to zero. * See enum URegexpFlag. All desired flags * are bitwise-ORed together. * @param pe Receives the position (line and column numbers) of any syntax * error within the source regular expression string. If this * information is not wanted, pass NULL for this parameter. * @param status Receives error detected by this function. * @stable ICU 3.0 * */ U_CAPI URegularExpression * U_EXPORT2 uregex_open( const UChar *pattern, int32_t patternLength, uint32_t flags, UParseError *pe, UErrorCode *status); /** * Open (compile) an ICU regular expression. Compiles the regular expression in * string form into an internal representation using the specified match mode flags. * The resulting regular expression handle can then be used to perform various * matching operations. *

* The contents of the pattern UText will be extracted and saved. Ownership of the * UText struct itself remains with the caller. This is to match the behavior of * uregex_open(). * * @param pattern The Regular Expression pattern to be compiled. * @param flags Flags that alter the default matching behavior for * the regular expression, UREGEX_CASE_INSENSITIVE, for * example. For default behavior, set this parameter to zero. * See enum URegexpFlag. All desired flags * are bitwise-ORed together. * @param pe Receives the position (line and column numbers) of any syntax * error within the source regular expression string. If this * information is not wanted, pass NULL for this parameter. * @param status Receives error detected by this function. * * @stable ICU 4.6 */ U_CAPI URegularExpression * U_EXPORT2 uregex_openUText(UText *pattern, uint32_t flags, UParseError *pe, UErrorCode *status); #if !UCONFIG_NO_CONVERSION /** * Open (compile) an ICU regular expression. The resulting regular expression * handle can then be used to perform various matching operations. *

* This function is the same as uregex_open, except that the pattern * is supplied as an 8 bit char * string in the default code page. * * @param pattern The Regular Expression pattern to be compiled, * NUL terminated. * @param flags Flags that alter the default matching behavior for * the regular expression, UREGEX_CASE_INSENSITIVE, for * example. For default behavior, set this parameter to zero. * See enum URegexpFlag. All desired flags * are bitwise-ORed together. * @param pe Receives the position (line and column numbers) of any syntax * error within the source regular expression string. If this * information is not wanted, pass NULL for this parameter. * @param status Receives errors detected by this function. * @return The URegularExpression object representing the compiled * pattern. * * @stable ICU 3.0 */ U_CAPI URegularExpression * U_EXPORT2 uregex_openC( const char *pattern, uint32_t flags, UParseError *pe, UErrorCode *status); #endif /** * Close the regular expression, recovering all resources (memory) it * was holding. * * @param regexp The regular expression to be closed. * @stable ICU 3.0 */ U_CAPI void U_EXPORT2 uregex_close(URegularExpression *regexp); /** * Make a copy of a compiled regular expression. Cloning a regular * expression is faster than opening a second instance from the source * form of the expression, and requires less memory. *

* Note that the current input string and the position of any matched text * within it are not cloned; only the pattern itself and the * match mode flags are copied. *

* Cloning can be particularly useful to threaded applications that perform * multiple match operations in parallel. Each concurrent RE * operation requires its own instance of a URegularExpression. * * @param regexp The compiled regular expression to be cloned. * @param status Receives indication of any errors encountered * @return the cloned copy of the compiled regular expression. * @stable ICU 3.0 */ U_CAPI URegularExpression * U_EXPORT2 uregex_clone(const URegularExpression *regexp, UErrorCode *status); /** * Returns a pointer to the source form of the pattern for this regular expression. * This function will work even if the pattern was originally specified as a UText. * * @param regexp The compiled regular expression. * @param patLength This output parameter will be set to the length of the * pattern string. A NULL pointer may be used here if the * pattern length is not needed, as would be the case if * the pattern is known in advance to be a NUL terminated * string. * @param status Receives errors detected by this function. * @return a pointer to the pattern string. The storage for the string is * owned by the regular expression object, and must not be * altered or deleted by the application. The returned string * will remain valid until the regular expression is closed. * @stable ICU 3.0 */ U_CAPI const UChar * U_EXPORT2 uregex_pattern(const URegularExpression *regexp, int32_t *patLength, UErrorCode *status); /** * Returns the source text of the pattern for this regular expression. * This function will work even if the pattern was originally specified as a UChar string. * * @param regexp The compiled regular expression. * @param status Receives errors detected by this function. * @return the pattern text. The storage for the text is owned by the regular expression * object, and must not be altered or deleted. 
* * @stable ICU 4.6 */ U_CAPI UText * U_EXPORT2 uregex_patternUText(const URegularExpression *regexp, UErrorCode *status); /** * Get the match mode flags that were specified when compiling this regular expression. * @param regexp The compiled regular expression. * @param status Receives errors detected by this function. * @return The match mode flags. * @see URegexpFlag * @stable ICU 3.0 */ U_CAPI int32_t U_EXPORT2 uregex_flags(const URegularExpression *regexp, UErrorCode *status); /** * Set the subject text string upon which the regular expression will look for matches. * This function may be called any number of times, allowing the regular * expression pattern to be applied to different strings. *

* Regular expression matching operations work directly on the application's * string data. No copy is made. The subject string data must not be * altered after calling this function until after all regular expression * operations involving this string data are completed. *

* Zero length strings are permitted. In this case, no subsequent match * operation will dereference the text string pointer. * * @param regexp The compiled regular expression. * @param text The subject text string. * @param textLength The length of the subject text, or -1 if the string * is NUL terminated. * @param status Receives errors detected by this function. * @stable ICU 3.0 */ U_CAPI void U_EXPORT2 uregex_setText(URegularExpression *regexp, const UChar *text, int32_t textLength, UErrorCode *status); /** * Set the subject text string upon which the regular expression will look for matches. * This function may be called any number of times, allowing the regular * expression pattern to be applied to different strings. *

* Regular expression matching operations work directly on the application's * string data; only a shallow clone is made. The subject string data must not be * altered after calling this function until after all regular expression * operations involving this string data are completed. * * @param regexp The compiled regular expression. * @param text The subject text string. * @param status Receives errors detected by this function. * * @stable ICU 4.6 */ U_CAPI void U_EXPORT2 uregex_setUText(URegularExpression *regexp, UText *text, UErrorCode *status); /** * Get the subject text that is currently associated with this * regular expression object. If the input was supplied using uregex_setText(), * that pointer will be returned. Otherwise, the characters in the input will * be extracted to a buffer and returned. In either case, ownership remains * with the regular expression object. * * This function will work even if the input was originally specified as a UText. * * @param regexp The compiled regular expression. * @param textLength The length of the string is returned in this output parameter. * A NULL pointer may be used here if the * text length is not needed, as would be the case if * the text is known in advance to be a NUL terminated * string. * @param status Receives errors detected by this function. * @return Pointer to the subject text string currently associated with * this regular expression. * @stable ICU 3.0 */ U_CAPI const UChar * U_EXPORT2 uregex_getText(URegularExpression *regexp, int32_t *textLength, UErrorCode *status); /** * Get the subject text that is currently associated with this * regular expression object. * * This function will work even if the input was originally specified as a UChar string. * * @param regexp The compiled regular expression. * @param dest A mutable UText in which to store the current input. * If NULL, a new UText will be created as an immutable shallow clone * of the actual input string. 
* @param status Receives errors detected by this function. * @return The subject text currently associated with this regular expression. * If a pre-allocated UText was provided, it will always be used and returned. * * @stable ICU 4.6 */ U_CAPI UText * U_EXPORT2 uregex_getUText(URegularExpression *regexp, UText *dest, UErrorCode *status); /** * Set the subject text string upon which the regular expression is looking for matches * without changing any other aspect of the matching state. * The new and previous text strings must have the same content. * * This function is intended for use in environments where ICU is operating on * strings that may move around in memory. It provides a mechanism for notifying * ICU that the string has been relocated, and providing a new UText to access the * string in its new position. * * Note that the regular expression implementation never copies the underlying text * of a string being matched, but always operates directly on the original text * provided by the user. Refreshing simply drops the references to the old text * and replaces them with references to the new. * * Caution: this function is normally used only by very specialized * system-level code. One example use case is with garbage collection * that moves the text in memory. * * @param regexp The compiled regular expression. * @param text The new (moved) text string. * @param status Receives errors detected by this function. * * @stable ICU 4.8 */ U_CAPI void U_EXPORT2 uregex_refreshUText(URegularExpression *regexp, UText *text, UErrorCode *status); /** * Attempts to match the input string against the pattern. * To succeed, the match must extend to the end of the string, * or cover the complete match region. * * If startIndex >= zero the match operation starts at the specified * index and must extend to the end of the input string. Any region * that has been specified is reset. 
* * If startIndex == -1 the match must cover the input region, or the entire * input string if no region has been set. This directly corresponds to * Matcher.matches() in Java * * @param regexp The compiled regular expression. * @param startIndex The input string (native) index at which to begin matching, or -1 * to match the input Region. * @param status Receives errors detected by this function. * @return true if there is a match * @stable ICU 3.0 */ U_CAPI UBool U_EXPORT2 uregex_matches(URegularExpression *regexp, int32_t startIndex, UErrorCode *status); /** * 64bit version of uregex_matches. * Attempts to match the input string against the pattern. * To succeed, the match must extend to the end of the string, * or cover the complete match region. * * If startIndex >= zero the match operation starts at the specified * index and must extend to the end of the input string. Any region * that has been specified is reset. * * If startIndex == -1 the match must cover the input region, or the entire * input string if no region has been set. This directly corresponds to * Matcher.matches() in Java * * @param regexp The compiled regular expression. * @param startIndex The input string (native) index at which to begin matching, or -1 * to match the input Region. * @param status Receives errors detected by this function. * @return true if there is a match * @stable ICU 4.6 */ U_CAPI UBool U_EXPORT2 uregex_matches64(URegularExpression *regexp, int64_t startIndex, UErrorCode *status); /** * Attempts to match the input string, starting from the specified index, against the pattern. * The match may be of any length, and is not required to extend to the end * of the input string. Contrast with uregex_matches(). * *

If startIndex is >= 0 any input region that was set for this * URegularExpression is reset before the operation begins. * *

If the specified starting index == -1 the match begins at the start of the input * region, or at the start of the full string if no region has been specified. * This corresponds directly with Matcher.lookingAt() in Java. * *

If the match succeeds then more information can be obtained via the * uregex_start(), uregex_end(), * and uregex_group() functions.

* * @param regexp The compiled regular expression. * @param startIndex The input string (native) index at which to begin matching, or * -1 to match the Input Region * @param status A reference to a UErrorCode to receive any errors. * @return true if there is a match. * @stable ICU 3.0 */ U_CAPI UBool U_EXPORT2 uregex_lookingAt(URegularExpression *regexp, int32_t startIndex, UErrorCode *status); /** * 64bit version of uregex_lookingAt. * Attempts to match the input string, starting from the specified index, against the pattern. * The match may be of any length, and is not required to extend to the end * of the input string. Contrast with uregex_matches(). * *

If startIndex is >= 0 any input region that was set for this * URegularExpression is reset before the operation begins. * *

If the specified starting index == -1 the match begins at the start of the input * region, or at the start of the full string if no region has been specified. * This corresponds directly with Matcher.lookingAt() in Java. * *

If the match succeeds then more information can be obtained via the * uregex_start(), uregex_end(), * and uregex_group() functions.

* * @param regexp The compiled regular expression. * @param startIndex The input string (native) index at which to begin matching, or * -1 to match the Input Region * @param status A reference to a UErrorCode to receive any errors. * @return true if there is a match. * @stable ICU 4.6 */ U_CAPI UBool U_EXPORT2 uregex_lookingAt64(URegularExpression *regexp, int64_t startIndex, UErrorCode *status); /** * Find the first matching substring of the input string that matches the pattern. * If startIndex is >= zero the search for a match begins at the specified index, * and any match region is reset. This corresponds directly with * Matcher.find(startIndex) in Java. * * If startIndex == -1 the search begins at the start of the input region, * or at the start of the full string if no region has been specified. * * If a match is found, uregex_start(), uregex_end(), and * uregex_group() will provide more information regarding the match. * * @param regexp The compiled regular expression. * @param startIndex The position (native) in the input string to begin the search, or * -1 to search within the Input Region. * @param status A reference to a UErrorCode to receive any errors. * @return true if a match is found. * @stable ICU 3.0 */ U_CAPI UBool U_EXPORT2 uregex_find(URegularExpression *regexp, int32_t startIndex, UErrorCode *status); /** * 64bit version of uregex_find. * Find the first matching substring of the input string that matches the pattern. * If startIndex is >= zero the search for a match begins at the specified index, * and any match region is reset. This corresponds directly with * Matcher.find(startIndex) in Java. * * If startIndex == -1 the search begins at the start of the input region, * or at the start of the full string if no region has been specified. * * If a match is found, uregex_start(), uregex_end(), and * uregex_group() will provide more information regarding the match. * * @param regexp The compiled regular expression. 
* @param startIndex The position (native) in the input string to begin the search, or * -1 to search within the Input Region. * @param status A reference to a UErrorCode to receive any errors. * @return true if a match is found. * @stable ICU 4.6 */ U_CAPI UBool U_EXPORT2 uregex_find64(URegularExpression *regexp, int64_t startIndex, UErrorCode *status); /** * Find the next pattern match in the input string. Begin searching * the input at the location following the end of the previous match, * or at the start of the string (or region) if there is no * previous match. If a match is found, uregex_start(), uregex_end(), and * uregex_group() will provide more information regarding the match. * * @param regexp The compiled regular expression. * @param status A reference to a UErrorCode to receive any errors. * @return true if a match is found. * @see uregex_reset * @stable ICU 3.0 */ U_CAPI UBool U_EXPORT2 uregex_findNext(URegularExpression *regexp, UErrorCode *status); /** * Get the number of capturing groups in this regular expression's pattern. * @param regexp The compiled regular expression. * @param status A reference to a UErrorCode to receive any errors. * @return the number of capture groups * @stable ICU 3.0 */ U_CAPI int32_t U_EXPORT2 uregex_groupCount(URegularExpression *regexp, UErrorCode *status); /** * Get the group number corresponding to a named capture group. * The returned number can be used with any function that accesses * capture groups by number. * * The function returns an error status if the specified name does not * appear in the pattern. * * @param regexp The compiled regular expression. * @param groupName The capture group name. * @param nameLength The length of the name, or -1 if the name is a * nul-terminated string. * @param status A pointer to a UErrorCode to receive any errors.
* @return The group number corresponding to the named capture group. * * @stable ICU 55 */ U_CAPI int32_t U_EXPORT2 uregex_groupNumberFromName(URegularExpression *regexp, const UChar *groupName, int32_t nameLength, UErrorCode *status); /** * Get the group number corresponding to a named capture group. * The returned number can be used with any function that accesses * capture groups by number. * * The function returns an error status if the specified name does not * appear in the pattern. * * @param regexp The compiled regular expression. * @param groupName The capture group name, * platform invariant characters only. * @param nameLength The length of the name, or -1 if the name is * nul-terminated. * @param status A pointer to a UErrorCode to receive any errors. * @return The group number corresponding to the named capture group. * * @stable ICU 55 */ U_CAPI int32_t U_EXPORT2 uregex_groupNumberFromCName(URegularExpression *regexp, const char *groupName, int32_t nameLength, UErrorCode *status); /** Extract the string for the specified matching expression or subexpression. * Group #0 is the complete string of matched text. * Group #1 is the text matched by the first set of capturing parentheses. * * @param regexp The compiled regular expression. * @param groupNum The capture group to extract. Group 0 is the complete * match. The value of this parameter must be * less than or equal to the number of capture groups in * the pattern. * @param dest Buffer to receive the matching string data * @param destCapacity Capacity of the dest buffer. * @param status A reference to a UErrorCode to receive any errors. * @return Length of matching data, * or -1 if no applicable match. * @stable ICU 3.0 */ U_CAPI int32_t U_EXPORT2 uregex_group(URegularExpression *regexp, int32_t groupNum, UChar *dest, int32_t destCapacity, UErrorCode *status); /** Returns a shallow immutable clone of the entire input string with the current index set * to the beginning of the requested capture group. The capture group length is also * returned via groupLength. * Group #0 is the complete string of matched text.
* Group #1 is the text matched by the first set of capturing parentheses. * * @param regexp The compiled regular expression. * @param groupNum The capture group to extract. Group 0 is the complete * match. The value of this parameter must be * less than or equal to the number of capture groups in * the pattern. * @param dest A mutable UText in which to store the current input. * If NULL, a new UText will be created as an immutable shallow clone * of the entire input string. * @param groupLength The group length of the desired capture group. Output parameter. * @param status A reference to a UErrorCode to receive any errors. * @return The subject text currently associated with this regular expression. * If a pre-allocated UText was provided, it will always be used and returned. * * @stable ICU 4.6 */ U_CAPI UText * U_EXPORT2 uregex_groupUText(URegularExpression *regexp, int32_t groupNum, UText *dest, int64_t *groupLength, UErrorCode *status); /** * Returns the index in the input string of the start of the text matched by the * specified capture group during the previous match operation. Return -1 if * the capture group was not part of the last match. * Group #0 refers to the complete range of matched text. * Group #1 refers to the text matched by the first set of capturing parentheses. * * @param regexp The compiled regular expression. * @param groupNum The capture group number * @param status A reference to a UErrorCode to receive any errors. * @return the starting (native) position in the input of the text matched * by the specified group. * @stable ICU 3.0 */ U_CAPI int32_t U_EXPORT2 uregex_start(URegularExpression *regexp, int32_t groupNum, UErrorCode *status); /** * 64bit version of uregex_start. * Returns the index in the input string of the start of the text matched by the * specified capture group during the previous match operation. Return -1 if * the capture group was not part of the last match. * Group #0 refers to the complete range of matched text. 
* Group #1 refers to the text matched by the first set of capturing parentheses. * * @param regexp The compiled regular expression. * @param groupNum The capture group number * @param status A reference to a UErrorCode to receive any errors. * @return the starting (native) position in the input of the text matched * by the specified group. * @stable ICU 4.6 */ U_CAPI int64_t U_EXPORT2 uregex_start64(URegularExpression *regexp, int32_t groupNum, UErrorCode *status); /** * Returns the index in the input string of the position following the end * of the text matched by the specified capture group. * Return -1 if the capture group was not part of the last match. * Group #0 refers to the complete range of matched text. * Group #1 refers to the text matched by the first set of capturing parentheses. * * @param regexp The compiled regular expression. * @param groupNum The capture group number * @param status A reference to a UErrorCode to receive any errors. * @return the (native) index of the position following the last matched character. * @stable ICU 3.0 */ U_CAPI int32_t U_EXPORT2 uregex_end(URegularExpression *regexp, int32_t groupNum, UErrorCode *status); /** * 64bit version of uregex_end. * Returns the index in the input string of the position following the end * of the text matched by the specified capture group. * Return -1 if the capture group was not part of the last match. * Group #0 refers to the complete range of matched text. * Group #1 refers to the text matched by the first set of capturing parentheses. * * @param regexp The compiled regular expression. * @param groupNum The capture group number * @param status A reference to a UErrorCode to receive any errors. * @return the (native) index of the position following the last matched character. * @stable ICU 4.6 */ U_CAPI int64_t U_EXPORT2 uregex_end64(URegularExpression *regexp, int32_t groupNum, UErrorCode *status); /** * Reset any saved state from the previous match. 
Has the effect of * causing uregex_findNext to begin at the specified index, and causing * uregex_start(), uregex_end() and uregex_group() to return an error * indicating that there is no match information available. Clears any * match region that may have been set. * * @param regexp The compiled regular expression. * @param index The position (native) in the text at which a * uregex_findNext() should begin searching. * @param status A reference to a UErrorCode to receive any errors. * @stable ICU 3.0 */ U_CAPI void U_EXPORT2 uregex_reset(URegularExpression *regexp, int32_t index, UErrorCode *status); /** * 64bit version of uregex_reset. * Reset any saved state from the previous match. Has the effect of * causing uregex_findNext to begin at the specified index, and causing * uregex_start(), uregex_end() and uregex_group() to return an error * indicating that there is no match information available. Clears any * match region that may have been set. * * @param regexp The compiled regular expression. * @param index The position (native) in the text at which a * uregex_findNext() should begin searching. * @param status A reference to a UErrorCode to receive any errors. * @stable ICU 4.6 */ U_CAPI void U_EXPORT2 uregex_reset64(URegularExpression *regexp, int64_t index, UErrorCode *status); /** * Sets the limits of the matching region for this URegularExpression. * The region is the part of the input string that will be considered when matching. * Invoking this method resets any saved state from the previous match, * then sets the region to start at the index specified by the start parameter * and end at the index specified by the end parameter. 
* * Depending on the transparency and anchoring being used (see useTransparentBounds * and useAnchoringBounds), certain constructs such as anchors may behave differently * at or around the boundaries of the region * * The function will fail if start is greater than limit, or if either index * is less than zero or greater than the length of the string being matched. * * @param regexp The compiled regular expression. * @param regionStart The (native) index to begin searches at. * @param regionLimit The (native) index to end searches at (exclusive). * @param status A pointer to a UErrorCode to receive any errors. * @stable ICU 4.0 */ U_CAPI void U_EXPORT2 uregex_setRegion(URegularExpression *regexp, int32_t regionStart, int32_t regionLimit, UErrorCode *status); /** * 64bit version of uregex_setRegion. * Sets the limits of the matching region for this URegularExpression. * The region is the part of the input string that will be considered when matching. * Invoking this method resets any saved state from the previous match, * then sets the region to start at the index specified by the start parameter * and end at the index specified by the end parameter. * * Depending on the transparency and anchoring being used (see useTransparentBounds * and useAnchoringBounds), certain constructs such as anchors may behave differently * at or around the boundaries of the region * * The function will fail if start is greater than limit, or if either index * is less than zero or greater than the length of the string being matched. * * @param regexp The compiled regular expression. * @param regionStart The (native) index to begin searches at. * @param regionLimit The (native) index to end searches at (exclusive). * @param status A pointer to a UErrorCode to receive any errors. 
* @stable ICU 4.6 */ U_CAPI void U_EXPORT2 uregex_setRegion64(URegularExpression *regexp, int64_t regionStart, int64_t regionLimit, UErrorCode *status); /** * Set the matching region and the starting index for subsequent matches * in a single operation. * This is useful because the usual function for setting the starting * index, uregex_reset(), also resets any region limits. * * @param regexp The compiled regular expression. * @param regionStart The (native) index to begin searches at. * @param regionLimit The (native) index to end searches at (exclusive). * @param startIndex The index in the input text at which the next * match operation should begin. * @param status A pointer to a UErrorCode to receive any errors. * @stable ICU 4.6 */ U_CAPI void U_EXPORT2 uregex_setRegionAndStart(URegularExpression *regexp, int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode *status); /** * Reports the start index of the matching region. Any matches found are limited to * the region bounded by regionStart (inclusive) and regionEnd (exclusive). * * @param regexp The compiled regular expression. * @param status A pointer to a UErrorCode to receive any errors. * @return The starting (native) index of this matcher's region. * @stable ICU 4.0 */ U_CAPI int32_t U_EXPORT2 uregex_regionStart(const URegularExpression *regexp, UErrorCode *status); /** * 64bit version of uregex_regionStart. * Reports the start index of the matching region. Any matches found are limited to * the region bounded by regionStart (inclusive) and regionEnd (exclusive). * * @param regexp The compiled regular expression. * @param status A pointer to a UErrorCode to receive any errors. * @return The starting (native) index of this matcher's region. * @stable ICU 4.6 */ U_CAPI int64_t U_EXPORT2 uregex_regionStart64(const URegularExpression *regexp, UErrorCode *status); /** * Reports the end index (exclusive) of the matching region for this URegularExpression.
* Any matches found are limited to the region bounded by regionStart (inclusive) * and regionEnd (exclusive). * * @param regexp The compiled regular expression. * @param status A pointer to a UErrorCode to receive any errors. * @return The ending point (native) of this matcher's region. * @stable ICU 4.0 */ U_CAPI int32_t U_EXPORT2 uregex_regionEnd(const URegularExpression *regexp, UErrorCode *status); /** * 64bit version of uregex_regionEnd. * Reports the end index (exclusive) of the matching region for this URegularExpression. * Any matches found are limited to the region bounded by regionStart (inclusive) * and regionEnd (exclusive). * * @param regexp The compiled regular expression. * @param status A pointer to a UErrorCode to receive any errors. * @return The ending point (native) of this matcher's region. * @stable ICU 4.6 */ U_CAPI int64_t U_EXPORT2 uregex_regionEnd64(const URegularExpression *regexp, UErrorCode *status); /** * Queries the transparency of region bounds for this URegularExpression. * See uregex_useTransparentBounds() for a description of transparent and opaque bounds. * By default, matching boundaries are opaque. * * @param regexp The compiled regular expression. * @param status A pointer to a UErrorCode to receive any errors. * @return true if this matcher is using transparent bounds, false if it is * using opaque bounds. * @stable ICU 4.0 */ U_CAPI UBool U_EXPORT2 uregex_hasTransparentBounds(const URegularExpression *regexp, UErrorCode *status); /** * Sets the transparency of region bounds for this URegularExpression. * Invoking this function with an argument of true will set matches to use transparent bounds. * If the boolean argument is false, then opaque bounds will be used. * * Using transparent bounds, the boundaries of the matching region are transparent * to lookahead, lookbehind, and boundary matching constructs. Those constructs can * see text beyond the boundaries of the region while checking for a match.
* * With opaque bounds, no text outside of the matching region is visible to lookahead, * lookbehind, and boundary matching constructs. * * By default, opaque bounds are used. * * @param regexp The compiled regular expression. * @param b true for transparent bounds; false for opaque bounds. * @param status A pointer to a UErrorCode to receive any errors. * @stable ICU 4.0 */ U_CAPI void U_EXPORT2 uregex_useTransparentBounds(URegularExpression *regexp, UBool b, UErrorCode *status); /** * Return true if this URegularExpression is using anchoring bounds. * By default, anchoring region bounds are used. * * @param regexp The compiled regular expression. * @param status A pointer to a UErrorCode to receive any errors. * @return true if this matcher is using anchoring bounds. * @stable ICU 4.0 */ U_CAPI UBool U_EXPORT2 uregex_hasAnchoringBounds(const URegularExpression *regexp, UErrorCode *status); /** * Set whether this URegularExpression is using Anchoring Bounds for its region. * With anchoring bounds, pattern anchors such as ^ and $ will match at the start * and end of the region. Without Anchoring Bounds, anchors will only match at * the positions they would in the complete text. * * Anchoring Bounds are the default for regions. * * @param regexp The compiled regular expression. * @param b true to enable anchoring bounds; false to disable them. * @param status A pointer to a UErrorCode to receive any errors. * @stable ICU 4.0 */ U_CAPI void U_EXPORT2 uregex_useAnchoringBounds(URegularExpression *regexp, UBool b, UErrorCode *status); /** * Return true if the most recent matching operation touched the * end of the text being processed. In this case, additional input text could * change the results of that match. * * @param regexp The compiled regular expression. * @param status A pointer to a UErrorCode to receive any errors.
* @return true if the most recent match hit the end of input. * @stable ICU 4.0 */ U_CAPI UBool U_EXPORT2 uregex_hitEnd(const URegularExpression *regexp, UErrorCode *status); /** * Return true if the most recent match succeeded and additional input could cause * it to fail. If this function returns false and a match was found, then more input * might change the match but the match won't be lost. If a match was not found, * then the result of uregex_requireEnd() has no meaning. * * @param regexp The compiled regular expression. * @param status A pointer to a UErrorCode to receive any errors. * @return true if more input could cause the most recent match to no longer match. * @stable ICU 4.0 */ U_CAPI UBool U_EXPORT2 uregex_requireEnd(const URegularExpression *regexp, UErrorCode *status); /** * Replaces every substring of the input that matches the pattern * with the given replacement string. This is a convenience function that * provides a complete find-and-replace-all operation. * * This method scans the input string looking for matches of the pattern. * Input that is not part of any match is copied unchanged to the * destination buffer. Matched regions are replaced in the output * buffer by the replacement string. The replacement string may contain * references to capture groups; these take the form of $1, $2, etc. * * @param regexp The compiled regular expression. * @param replacementText A string containing the replacement text. * @param replacementLength The length of the replacement string, or * -1 if it is NUL terminated. * @param destBuf A (UChar *) buffer that will receive the result. * @param destCapacity The capacity of the destination buffer. * @param status A reference to a UErrorCode to receive any errors. * @return The length of the string resulting from the find * and replace operation. In the event that the * destination capacity is inadequate, the return value * is still the full length of the untruncated string.
* @stable ICU 3.0 */ U_CAPI int32_t U_EXPORT2 uregex_replaceAll(URegularExpression *regexp, const UChar *replacementText, int32_t replacementLength, UChar *destBuf, int32_t destCapacity, UErrorCode *status); /** * Replaces every substring of the input that matches the pattern * with the given replacement string. This is a convenience function that * provides a complete find-and-replace-all operation. * * This method scans the input string looking for matches of the pattern. * Input that is not part of any match is copied unchanged to the * destination buffer. Matched regions are replaced in the output * buffer by the replacement string. The replacement string may contain * references to capture groups; these take the form of $1, $2, etc. * * @param regexp The compiled regular expression. * @param replacement A string containing the replacement text. * @param dest A mutable UText that will receive the result. * If NULL, a new UText will be created (which may not be mutable). * @param status A reference to a UErrorCode to receive any errors. * @return A UText containing the results of the find and replace. * If a pre-allocated UText was provided, it will always be used and returned. * * @stable ICU 4.6 */ U_CAPI UText * U_EXPORT2 uregex_replaceAllUText(URegularExpression *regexp, UText *replacement, UText *dest, UErrorCode *status); /** * Replaces the first substring of the input that matches the pattern * with the given replacement string. This is a convenience function that * provides a complete find-and-replace operation. * * This method scans the input string looking for a match of the pattern. * All input that is not part of the match is copied unchanged to the * destination buffer. The matched region is replaced in the output * buffer by the replacement string. The replacement string may contain * references to capture groups; these take the form of $1, $2, etc. * * @param regexp The compiled regular expression.
* @param replacementText A string containing the replacement text. * @param replacementLength The length of the replacement string, or * -1 if it is NUL terminated. * @param destBuf A (UChar *) buffer that will receive the result. * @param destCapacity The capacity of the destination buffer. * @param status A reference to a UErrorCode to receive any errors. * @return The length of the string resulting from the find * and replace operation. In the event that the * destination capacity is inadequate, the return value * is still the full length of the untruncated string. * @stable ICU 3.0 */ U_CAPI int32_t U_EXPORT2 uregex_replaceFirst(URegularExpression *regexp, const UChar *replacementText, int32_t replacementLength, UChar *destBuf, int32_t destCapacity, UErrorCode *status); /** * Replaces the first substring of the input that matches the pattern * with the given replacement string. This is a convenience function that * provides a complete find-and-replace operation. * * This method scans the input string looking for a match of the pattern. * All input that is not part of the match is copied unchanged to the * destination buffer. The matched region is replaced in the output * buffer by the replacement string. The replacement string may contain * references to capture groups; these take the form of $1, $2, etc. * * @param regexp The compiled regular expression. * @param replacement A string containing the replacement text. * @param dest A mutable UText that will receive the result. * If NULL, a new UText will be created (which may not be mutable). * @param status A reference to a UErrorCode to receive any errors. * @return A UText containing the results of the find and replace. * If a pre-allocated UText was provided, it will always be used and returned.
* * @stable ICU 4.6 */ U_CAPI UText * U_EXPORT2 uregex_replaceFirstUText(URegularExpression *regexp, UText *replacement, UText *dest, UErrorCode *status); /** * Implements a replace operation intended to be used as part of an * incremental find-and-replace. * *

The input string, starting from the end of the previous match and ending at * the start of the current match, is appended to the destination string. Then the * replacement string is appended to the output string, * including handling any substitutions of captured text.

* *

A note on preflight computation of buffer size and error handling: * Calls to uregex_appendReplacement() and uregex_appendTail() are * designed to be chained, one after another, with the destination * buffer pointer and buffer capacity updated after each in preparation * for the next. If the destination buffer is exhausted partway through such a * sequence, a U_BUFFER_OVERFLOW_ERROR status will be returned. Normal * ICU conventions are for a function to perform no action if it is * called with an error status, but for this one case, uregex_appendReplacement() * will operate normally so that buffer size computations will complete * correctly. * *

For simple, prepackaged, non-incremental find-and-replace * operations, see uregex_replaceFirst() or uregex_replaceAll().

* * @param regexp The regular expression object. * @param replacementText The string that will replace the matched portion of the * input string as it is copied to the destination buffer. * The replacement text may contain references ($1, for * example) to capture groups from the match. * @param replacementLength The length of the replacement text string, * or -1 if the string is NUL terminated. * @param destBuf The buffer into which the results of the * find-and-replace are placed. On return, this pointer * will be updated to refer to the beginning of the * unused portion of buffer, leaving it in position for * a subsequent call to this function. * @param destCapacity The size of the output buffer. On return, this * parameter will be updated to reflect the space remaining * unused in the output buffer. * @param status A reference to a UErrorCode to receive any errors. * @return The length of the result string. In the event that * destCapacity is inadequate, the full length of the * untruncated output string is returned. * * @stable ICU 3.0 * */ U_CAPI int32_t U_EXPORT2 uregex_appendReplacement(URegularExpression *regexp, const UChar *replacementText, int32_t replacementLength, UChar **destBuf, int32_t *destCapacity, UErrorCode *status); /** * Implements a replace operation intended to be used as part of an * incremental find-and-replace. * *

The input string, starting from the end of the previous match and ending at * the start of the current match, is appended to the destination string. Then the * replacement string is appended to the output string, * including handling any substitutions of captured text.

* *

For simple, prepackaged, non-incremental find-and-replace * operations, see uregex_replaceFirstUText() or uregex_replaceAllUText().

* * @param regexp The regular expression object. * @param replacementText The string that will replace the matched portion of the * input string as it is copied to the destination buffer. * The replacement text may contain references ($1, for * example) to capture groups from the match. * @param dest A mutable UText that will receive the result. Must not be NULL. * @param status A reference to a UErrorCode to receive any errors. * * @stable ICU 4.6 */ U_CAPI void U_EXPORT2 uregex_appendReplacementUText(URegularExpression *regexp, UText *replacementText, UText *dest, UErrorCode *status); /** * As the final step in a find-and-replace operation, append the remainder * of the input string, starting at the position following the last match, * to the destination string. uregex_appendTail() is intended * to be invoked after one or more invocations of the * uregex_appendReplacement() function. * * @param regexp The regular expression object. This is needed to * obtain the input string along with the position * of the last match within it. * @param destBuf The buffer in which the results of the * find-and-replace are placed. On return, the pointer * will be updated to refer to the beginning of the * unused portion of buffer. * @param destCapacity The size of the output buffer. On return, this * value will be updated to reflect the space remaining * unused in the output buffer. * @param status A reference to a UErrorCode to receive any errors. * @return The length of the result string. In the event that * destCapacity is inadequate, the full length of the * untruncated output string is returned. * * @stable ICU 3.0 */ U_CAPI int32_t U_EXPORT2 uregex_appendTail(URegularExpression *regexp, UChar **destBuf, int32_t *destCapacity, UErrorCode *status); /** * As the final step in a find-and-replace operation, append the remainder * of the input string, starting at the position following the last match, * to the destination string.
uregex_appendTailUText() is intended * to be invoked after one or more invocations of the * uregex_appendReplacementUText() function. * * @param regexp The regular expression object. This is needed to * obtain the input string along with the position * of the last match within it. * @param dest A mutable UText that will receive the result. Must not be NULL. * * @param status Error code * * @return The destination UText. * * @stable ICU 4.6 */ U_CAPI UText * U_EXPORT2 uregex_appendTailUText(URegularExpression *regexp, UText *dest, UErrorCode *status); /** * Split a string into fields. Somewhat like split() from Perl. * The pattern matches identify delimiters that separate the input * into fields. The input data between the matches becomes the * fields themselves. * * Each of the fields is copied from the input string to the destination * buffer, and NUL terminated. The position of each field within * the destination buffer is returned in the destFields array. * * If the delimiter pattern includes capture groups, the captured text will * also appear in the destination array of output strings, interspersed * with the fields. This is similar to Perl, but differs from Java, * which ignores the presence of capture groups in the pattern. * * Trailing empty fields will always be returned, assuming sufficient * destination capacity. This differs from the default behavior for Java * and Perl where trailing empty fields are not returned. * * The number of strings produced by the split operation is returned. * This count includes the strings from capture groups in the delimiter pattern. * This behavior differs from Java, which ignores capture groups. * * @param regexp The compiled regular expression. * @param destBuf A (UChar *) buffer to receive the fields that * are extracted from the input string. These * field pointers will refer to positions within the * destination buffer supplied by the caller. Any * extra positions within the destFields array will be * set to NULL.
* @param destCapacity The capacity of the destBuf. * @param requiredCapacity The actual capacity required of the destBuf. * If destCapacity is too small, requiredCapacity will return * the total capacity required to hold all of the output, and * a U_BUFFER_OVERFLOW_ERROR will be returned. * @param destFields An array to be filled with the position of each * of the extracted fields within destBuf. * @param destFieldsCapacity The number of elements in the destFields array. * If the number of fields found is less than destFieldsCapacity, * the extra destFields elements are set to zero. * If destFieldsCapacity is too small, the trailing part of the * input, including any field delimiters, is treated as if it * were the last field - it is copied to the destBuf, and * its position in the destBuf is stored in the last element * of destFields. This behavior mimics that of Perl. It is not * an error condition, and no error status is returned when all destField * positions are used. * @param status A reference to a UErrorCode to receive any errors. * @return The number of fields into which the input string was split. * @stable ICU 3.0 */ U_CAPI int32_t U_EXPORT2 uregex_split( URegularExpression *regexp, UChar *destBuf, int32_t destCapacity, int32_t *requiredCapacity, UChar *destFields[], int32_t destFieldsCapacity, UErrorCode *status); /** * Split a string into fields. Somewhat like split() from Perl. * The pattern matches identify delimiters that separate the input * into fields. The input data between the matches becomes the * fields themselves. *

* The behavior of this function is not very closely aligned with uregex_split(); * instead, it is based on (and implemented directly on top of) the C++ split method. * * @param regexp The compiled regular expression. * @param destFields An array of mutable UText structs to receive the results of the split. * If a field is NULL, a new UText is allocated to contain the results for * that field. This new UText is not guaranteed to be mutable. * @param destFieldsCapacity The number of elements in the destination array. * If the number of fields found is less than destFieldsCapacity, the * extra strings in the destination array are not altered. * If the number of destination strings is less than the number * of fields, the trailing part of the input string, including any * field delimiters, is placed in the last destination string. * This behavior mimics that of Perl. It is not an error condition, and no * error status is returned when all destField positions are used. * @param status A reference to a UErrorCode to receive any errors. * @return The number of fields into which the input string was split. * * @stable ICU 4.6 */ U_CAPI int32_t U_EXPORT2 uregex_splitUText(URegularExpression *regexp, UText *destFields[], int32_t destFieldsCapacity, UErrorCode *status); /** * Set a processing time limit for match operations with this URegularExpression. * * Some patterns, when matching certain strings, can run in exponential time. * For practical purposes, the match operation may appear to be in an * infinite loop. * When a limit is set a match operation will fail with an error if the * limit is exceeded. *

* The units of the limit are steps of the match engine. * Correspondence with actual processor time will depend on the speed * of the processor and the details of the specific pattern, but will * typically be on the order of milliseconds. *

* By default, the matching time is not limited. *

* * @param regexp The compiled regular expression. * @param limit The limit value, or 0 for no limit. * @param status A reference to a UErrorCode to receive any errors. * @stable ICU 4.0 */ U_CAPI void U_EXPORT2 uregex_setTimeLimit(URegularExpression *regexp, int32_t limit, UErrorCode *status); /** * Get the time limit for matches with this URegularExpression. * A return value of zero indicates that there is no limit. * * @param regexp The compiled regular expression. * @param status A reference to a UErrorCode to receive any errors. * @return the maximum allowed time for a match, in units of processing steps. * @stable ICU 4.0 */ U_CAPI int32_t U_EXPORT2 uregex_getTimeLimit(const URegularExpression *regexp, UErrorCode *status); /** * Set the amount of heap storage available for use by the match backtracking stack. *

* ICU uses a backtracking regular expression engine, with the backtrack stack * maintained on the heap. This function sets the limit to the amount of memory * that can be used for this purpose. A backtracking stack overflow will * result in an error from the match operation that caused it. *

* A limit is desirable because a malicious or poorly designed pattern can use * excessive memory, potentially crashing the process. A limit is enabled * by default. *

* @param regexp The compiled regular expression. * @param limit The maximum size, in bytes, of the matching backtrack stack. * A value of zero means no limit. * The limit must be greater than or equal to zero. * @param status A reference to a UErrorCode to receive any errors. * * @stable ICU 4.0 */ U_CAPI void U_EXPORT2 uregex_setStackLimit(URegularExpression *regexp, int32_t limit, UErrorCode *status); /** * Get the size of the heap storage available for use by the backtracking stack. * * @param regexp The compiled regular expression. * @param status A reference to a UErrorCode to receive any errors. * @return the maximum backtracking stack size, in bytes, or zero if the * stack size is unlimited. * @stable ICU 4.0 */ U_CAPI int32_t U_EXPORT2 uregex_getStackLimit(const URegularExpression *regexp, UErrorCode *status); /** * Function pointer for a regular expression matching callback function. * When set, a callback function will be called periodically during matching * operations. If the callback function returns false, the matching * operation will be terminated early. * * Note: the callback function must not call other functions on this * URegularExpression. * * @param context context pointer. The callback function will be invoked * with the context specified at the time that * uregex_setMatchCallback() is called. * @param steps the accumulated processing time, in match steps, * for this matching operation. * @return true to continue the matching operation. * false to terminate the matching operation. * @stable ICU 4.0 */ U_CDECL_BEGIN typedef UBool U_CALLCONV URegexMatchCallback ( const void *context, int32_t steps); U_CDECL_END /** * Set a callback function for this URegularExpression. * During matching operations the function will be called periodically, * giving the application the opportunity to terminate a long-running * match. * * @param regexp The compiled regular expression. * @param callback A pointer to the user-supplied callback function. * @param context User context pointer.
The value supplied at the * time the callback function is set will be saved * and passed to the callback each time that it is called. * @param status A reference to a UErrorCode to receive any errors. * @stable ICU 4.0 */ U_CAPI void U_EXPORT2 uregex_setMatchCallback(URegularExpression *regexp, URegexMatchCallback *callback, const void *context, UErrorCode *status); /** * Get the callback function for this URegularExpression. * * @param regexp The compiled regular expression. * @param callback Out parameter, receives a pointer to the user-supplied * callback function. * @param context Out parameter, receives the user context pointer that * was set when uregex_setMatchCallback() was called. * @param status A reference to a UErrorCode to receive any errors. * @stable ICU 4.0 */ U_CAPI void U_EXPORT2 uregex_getMatchCallback(const URegularExpression *regexp, URegexMatchCallback **callback, const void **context, UErrorCode *status); /** * Function pointer for a regular expression find callback function. * * When set, a callback function will be called during a find operation * and for operations that depend on find, such as findNext, split and some replace * operations like replaceFirst. * The callback will usually be called after each attempt at a match, but this is not a * guarantee that the callback will be invoked at each character. For finds where the * match engine is invoked at each character, this may be close to true, but less likely * for more optimized loops where the pattern is known to only start, and the match * engine invoked, at certain characters. * When invoked, this callback will specify the index at which a match operation is about * to be attempted, giving the application the opportunity to terminate a long-running * find operation. * * If the callback function returns false, the find operation will be terminated early. * * Note: the callback function must not call other functions on this * URegularExpression. * * @param context context pointer.
The callback function will be invoked * with the context specified at the time that * uregex_setFindProgressCallback() is called. * @param matchIndex the next index at which a match attempt will be made for this * find operation. If this callback interrupts the search, this is the * index at which a find/findNext operation may be re-initiated. * @return true to continue the matching operation. * false to terminate the matching operation. * @stable ICU 4.6 */ U_CDECL_BEGIN typedef UBool U_CALLCONV URegexFindProgressCallback ( const void *context, int64_t matchIndex); U_CDECL_END /** * Set the find progress callback function for this URegularExpression. * * @param regexp The compiled regular expression. * @param callback A pointer to the user-supplied callback function. * @param context User context pointer. The value supplied at the * time the callback function is set will be saved * and passed to the callback each time that it is called. * @param status A reference to a UErrorCode to receive any errors. * @stable ICU 4.6 */ U_CAPI void U_EXPORT2 uregex_setFindProgressCallback(URegularExpression *regexp, URegexFindProgressCallback *callback, const void *context, UErrorCode *status); /** * Get the find progress callback function for this URegularExpression. * * @param regexp The compiled regular expression. * @param callback Out parameter, receives a pointer to the user-supplied * callback function. * @param context Out parameter, receives the user context pointer that * was set when uregex_setFindProgressCallback() was called. * @param status A reference to a UErrorCode to receive any errors. * @stable ICU 4.6 */ U_CAPI void U_EXPORT2 uregex_getFindProgressCallback(const URegularExpression *regexp, URegexFindProgressCallback **callback, const void **context, UErrorCode *status); #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ #endif /* UREGEX_H */ // uregion.h // Copyright (C) 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html /* ***************************************************************************************** * Copyright (C) 2014, International Business Machines * Corporation and others. All Rights Reserved. ***************************************************************************************** */ #ifndef UREGION_H #define UREGION_H /** * \file * \brief C API: URegion (territory containment and mapping) * * URegion objects represent data associated with a particular Unicode Region Code, also known as a * Unicode Region Subtag, which is defined based upon the BCP 47 standard. These include: * * Two-letter codes defined by ISO 3166-1, with special LDML treatment of certain private-use or * reserved codes; * * A subset of 3-digit numeric codes defined by UN M.49. * URegion objects can also provide mappings to and from additional codes. There are different types * of regions that are important to distinguish: *

* Macroregion - A code for a "macro geographical (continental) region, geographical sub-region, or * selected economic and other grouping" as defined in UN M.49. These are typically 3-digit codes, * but contain some 2-letter codes for LDML extensions, such as "QO" for Outlying Oceania. * Macroregions are represented in ICU by one of three region types: WORLD (code 001), * CONTINENTS (regions contained directly by WORLD), and SUBCONTINENTS (regions contained directly * by a continent). *

* TERRITORY - A Region that is not a Macroregion. These are typically codes for countries, but also * include areas that are not separate countries, such as the code "AQ" for Antarctica or the code * "HK" for Hong Kong (SAR China). Overseas dependencies of countries may or may not have separate * codes. The codes are typically 2-letter codes aligned with ISO 3166, but BCP47 allows for the use * of 3-digit codes in the future. *

* UNKNOWN - The code ZZ is defined by Unicode LDML for use in indicating that the region is unknown, * or that the value supplied as a region was invalid. *

* DEPRECATED - Region codes that have been defined in the past but are no longer in modern usage, * usually due to a country splitting into multiple territories or changing its name. *

* GROUPING - A widely understood grouping of territories that has a well defined membership such * that a region code has been assigned for it. Some of these are UN M.49 codes that don't fall into * the world/continent/sub-continent hierarchy, while others are just well-known groupings that have * their own region code. Region "EU" (European Union) is one such region code that is a grouping. * Groupings will never be returned by uregion_getContainingRegion(), since a different type of region * (WORLD, CONTINENT, or SUBCONTINENT) will always be the containing region instead. * * URegion objects are const/immutable, owned and maintained by ICU itself, so there are no functions * to open or close them. */ /** * URegionType is an enumeration defining the different types of regions. Current possible * values are URGN_WORLD, URGN_CONTINENT, URGN_SUBCONTINENT, URGN_TERRITORY, URGN_GROUPING, * URGN_DEPRECATED, and URGN_UNKNOWN. * * @stable ICU 51 */ typedef enum URegionType { /** * Type representing the unknown region. * @stable ICU 51 */ URGN_UNKNOWN, /** * Type representing a territory. * @stable ICU 51 */ URGN_TERRITORY, /** * Type representing the whole world. * @stable ICU 51 */ URGN_WORLD, /** * Type representing a continent. * @stable ICU 51 */ URGN_CONTINENT, /** * Type representing a sub-continent. * @stable ICU 51 */ URGN_SUBCONTINENT, /** * Type representing a grouping of territories that is not to be used in * the normal WORLD/CONTINENT/SUBCONTINENT/TERRITORY containment tree. * @stable ICU 51 */ URGN_GROUPING, /** * Type representing a region whose code has been deprecated, usually * due to a country splitting into multiple territories or changing its name. * @stable ICU 51 */ URGN_DEPRECATED, } URegionType; #if !UCONFIG_NO_FORMATTING /** * Opaque URegion object for use in C programs.
* @stable ICU 52 */ struct URegion; typedef struct URegion URegion; /**< @stable ICU 52 */ /** * Returns a pointer to a URegion for the specified region code: A 2-letter or 3-letter ISO 3166 * code, UN M.49 numeric code (superset of ISO 3166 numeric codes), or other valid Unicode Region * Code as defined by the LDML specification. The code will be canonicalized internally. If the * region code is NULL or not recognized, the appropriate error code will be set * (U_ILLEGAL_ARGUMENT_ERROR). * @stable ICU 52 */ U_CAPI const URegion* U_EXPORT2 uregion_getRegionFromCode(const char *regionCode, UErrorCode *status); /** * Returns a pointer to a URegion for the specified numeric region code. If the numeric region * code is not recognized, the appropriate error code will be set (U_ILLEGAL_ARGUMENT_ERROR). * @stable ICU 52 */ U_CAPI const URegion* U_EXPORT2 uregion_getRegionFromNumericCode (int32_t code, UErrorCode *status); /** * Returns an enumeration over the canonical codes of all known regions that match the given type. * The enumeration must be closed with uenum_close(). * @stable ICU 52 */ U_CAPI UEnumeration* U_EXPORT2 uregion_getAvailable(URegionType type, UErrorCode *status); /** * Returns true if the specified uregion is equal to the specified otherRegion. * @stable ICU 52 */ U_CAPI UBool U_EXPORT2 uregion_areEqual(const URegion* uregion, const URegion* otherRegion); /** * Returns a pointer to the URegion that contains the specified uregion. Returns NULL if the * specified uregion is code "001" (World) or "ZZ" (Unknown region). For example, calling * this method with region "IT" (Italy) returns the URegion for "039" (Southern Europe). * @stable ICU 52 */ U_CAPI const URegion* U_EXPORT2 uregion_getContainingRegion(const URegion* uregion); /** * Return a pointer to the URegion that geographically contains this uregion and matches the * specified type, moving multiple steps up the containment chain if necessary.
Returns NULL if no * containing region can be found that matches the specified type. Will return NULL if URegionType * is URGN_GROUPING, URGN_DEPRECATED, or URGN_UNKNOWN which are not appropriate for this API. * For example, calling this method with uregion "IT" (Italy) for type URGN_CONTINENT returns the * URegion "150" (Europe). * @stable ICU 52 */ U_CAPI const URegion* U_EXPORT2 uregion_getContainingRegionOfType(const URegion* uregion, URegionType type); /** * Return an enumeration over the canonical codes of all the regions that are immediate children * of the specified uregion in the region hierarchy. These returned regions could be either macro * regions, territories, or a mixture of the two, depending on the containment data as defined in * CLDR. This API returns NULL if this uregion doesn't have any sub-regions. For example, calling * this function for uregion "150" (Europe) returns an enumeration containing the various * sub-regions of Europe: "039" (Southern Europe), "151" (Eastern Europe), "154" (Northern Europe), * and "155" (Western Europe). The enumeration must be closed with uenum_close(). * @stable ICU 52 */ U_CAPI UEnumeration* U_EXPORT2 uregion_getContainedRegions(const URegion* uregion, UErrorCode *status); /** * Returns an enumeration over the canonical codes of all the regions that are children of the * specified uregion anywhere in the region hierarchy and match the given type. This API may return * an empty enumeration if this uregion doesn't have any sub-regions that match the given type. * For example, calling this method with region "150" (Europe) and type URGN_TERRITORY returns an * enumeration containing all the territories in Europe: "FR" (France), "IT" (Italy), "DE" (Germany), * etc. The enumeration must be closed with uenum_close().
* @stable ICU 52 */ U_CAPI UEnumeration* U_EXPORT2 uregion_getContainedRegionsOfType(const URegion* uregion, URegionType type, UErrorCode *status); /** * Returns true if the specified uregion contains the specified otherRegion anywhere in the region * hierarchy. * @stable ICU 52 */ U_CAPI UBool U_EXPORT2 uregion_contains(const URegion* uregion, const URegion* otherRegion); /** * If the specified uregion is deprecated, returns an enumeration over the canonical codes of the * regions that are the preferred replacement regions for the specified uregion. If the specified * uregion is not deprecated, returns NULL. For example, calling this method with uregion * "SU" (Soviet Union) returns a list of the regions containing "RU" (Russia), "AM" (Armenia), * "AZ" (Azerbaijan), etc... The enumeration must be closed with uenum_close(). * @stable ICU 52 */ U_CAPI UEnumeration* U_EXPORT2 uregion_getPreferredValues(const URegion* uregion, UErrorCode *status); /** * Returns the specified uregion's canonical code. * @stable ICU 52 */ U_CAPI const char* U_EXPORT2 uregion_getRegionCode(const URegion* uregion); /** * Returns the specified uregion's numeric code, or a negative value if there is no numeric code * for the specified uregion. * @stable ICU 52 */ U_CAPI int32_t U_EXPORT2 uregion_getNumericCode(const URegion* uregion); /** * Returns the URegionType of the specified uregion. * @stable ICU 52 */ U_CAPI URegionType U_EXPORT2 uregion_getType(const URegion* uregion); #endif /* #if !UCONFIG_NO_FORMATTING */ #endif // ureldatefmt.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ***************************************************************************************** * Copyright (C) 2016, International Business Machines * Corporation and others. All Rights Reserved.
***************************************************************************************** */ #ifndef URELDATEFMT_H #define URELDATEFMT_H #if !UCONFIG_NO_FORMATTING && !UCONFIG_NO_BREAK_ITERATION /** * \file * \brief C API: URelativeDateTimeFormatter, relative date formatting of unit + numeric offset. * * Provides simple formatting of relative dates, in two ways *

    *
  • relative dates with a quantity e.g "in 5 days"
  • *
  • relative dates without a quantity e.g "next Tuesday"
  • *
*

* This does not provide compound formatting for multiple units, * other than the ability to combine a time string with a relative date, * as in "next Tuesday at 3:45 PM". It also does not provide support * for determining which unit to use, such as deciding between "in 7 days" * and "in 1 week". * * @stable ICU 57 */ /** * The formatting style * @stable ICU 54 */ typedef enum UDateRelativeDateTimeFormatterStyle { /** * Everything spelled out. * @stable ICU 54 */ UDAT_STYLE_LONG, /** * Abbreviations used when possible. * @stable ICU 54 */ UDAT_STYLE_SHORT, /** * Use the shortest possible form. * @stable ICU 54 */ UDAT_STYLE_NARROW, } UDateRelativeDateTimeFormatterStyle; /** * Represents the unit for formatting a relative date, e.g. "in 5 days" * or "next year". * @stable ICU 57 */ typedef enum URelativeDateTimeUnit { /** * Specifies that relative unit is year, e.g. "last year", * "in 5 years". * @stable ICU 57 */ UDAT_REL_UNIT_YEAR, /** * Specifies that relative unit is quarter, e.g. "last quarter", * "in 5 quarters". * @stable ICU 57 */ UDAT_REL_UNIT_QUARTER, /** * Specifies that relative unit is month, e.g. "last month", * "in 5 months". * @stable ICU 57 */ UDAT_REL_UNIT_MONTH, /** * Specifies that relative unit is week, e.g. "last week", * "in 5 weeks". * @stable ICU 57 */ UDAT_REL_UNIT_WEEK, /** * Specifies that relative unit is day, e.g. "yesterday", * "in 5 days". * @stable ICU 57 */ UDAT_REL_UNIT_DAY, /** * Specifies that relative unit is hour, e.g. "1 hour ago", * "in 5 hours". * @stable ICU 57 */ UDAT_REL_UNIT_HOUR, /** * Specifies that relative unit is minute, e.g. "1 minute ago", * "in 5 minutes". * @stable ICU 57 */ UDAT_REL_UNIT_MINUTE, /** * Specifies that relative unit is second, e.g. "1 second ago", * "in 5 seconds". * @stable ICU 57 */ UDAT_REL_UNIT_SECOND, /** * Specifies that relative unit is Sunday, e.g. "last Sunday", * "this Sunday", "next Sunday", "in 5 Sundays".
* @stable ICU 57 */ UDAT_REL_UNIT_SUNDAY, /** * Specifies that relative unit is Monday, e.g. "last Monday", * "this Monday", "next Monday", "in 5 Mondays". * @stable ICU 57 */ UDAT_REL_UNIT_MONDAY, /** * Specifies that relative unit is Tuesday, e.g. "last Tuesday", * "this Tuesday", "next Tuesday", "in 5 Tuesdays". * @stable ICU 57 */ UDAT_REL_UNIT_TUESDAY, /** * Specifies that relative unit is Wednesday, e.g. "last Wednesday", * "this Wednesday", "next Wednesday", "in 5 Wednesdays". * @stable ICU 57 */ UDAT_REL_UNIT_WEDNESDAY, /** * Specifies that relative unit is Thursday, e.g. "last Thursday", * "this Thursday", "next Thursday", "in 5 Thursdays". * @stable ICU 57 */ UDAT_REL_UNIT_THURSDAY, /** * Specifies that relative unit is Friday, e.g. "last Friday", * "this Friday", "next Friday", "in 5 Fridays". * @stable ICU 57 */ UDAT_REL_UNIT_FRIDAY, /** * Specifies that relative unit is Saturday, e.g. "last Saturday", * "this Saturday", "next Saturday", "in 5 Saturdays". * @stable ICU 57 */ UDAT_REL_UNIT_SATURDAY, } URelativeDateTimeUnit; #if (NTDDI_VERSION >= NTDDI_WIN10_CO) /** * FieldPosition and UFieldPosition selectors for format fields * defined by RelativeDateTimeFormatter. * @stable ICU 64 */ typedef enum URelativeDateTimeFormatterField { /** * Represents a literal text string, like "tomorrow" or "days ago". * @stable ICU 64 */ UDAT_REL_LITERAL_FIELD, /** * Represents a number quantity, like "3" in "3 days ago". * @stable ICU 64 */ UDAT_REL_NUMERIC_FIELD, } URelativeDateTimeFormatterField; #endif // (NTDDI_VERSION >= NTDDI_WIN10_CO) /** * Opaque URelativeDateTimeFormatter object for use in C programs. * @stable ICU 57 */ struct URelativeDateTimeFormatter; typedef struct URelativeDateTimeFormatter URelativeDateTimeFormatter; /**< C typedef for struct URelativeDateTimeFormatter.
@stable ICU 57 */ /** * Open a new URelativeDateTimeFormatter object for a given locale using the * specified width and capitalizationContext, along with a number formatter * (if desired) to override the default formatter that would be used for * display of numeric field offsets. The default formatter typically rounds * toward 0 and has a minimum of 0 fraction digits and a maximum of 3 * fraction digits (i.e. it will show as many decimal places as necessary * up to 3, without showing trailing 0s). * * @param locale * The locale * @param nfToAdopt * A number formatter to set for this URelativeDateTimeFormatter * object (instead of the default decimal formatter). Ownership of * this UNumberFormat object will pass to the URelativeDateTimeFormatter * object (the URelativeDateTimeFormatter adopts the UNumberFormat), * which becomes responsible for closing it. If the caller wishes to * retain ownership of the UNumberFormat object, the caller must clone * it (with unum_clone) and pass the clone to ureldatefmt_open. May be * NULL to use the default decimal formatter. * @param width * The width - wide, short, narrow, etc. * @param capitalizationContext * A value from UDisplayContext that pertains to capitalization, e.g. * UDISPCTX_CAPITALIZATION_FOR_BEGINNING_OF_SENTENCE. * @param status * A pointer to a UErrorCode to receive any errors. * @return * A pointer to a URelativeDateTimeFormatter object for the specified locale, * or NULL if an error occurred. * @stable ICU 57 */ U_CAPI URelativeDateTimeFormatter* U_EXPORT2 ureldatefmt_open( const char* locale, UNumberFormat* nfToAdopt, UDateRelativeDateTimeFormatterStyle width, UDisplayContext capitalizationContext, UErrorCode* status ); /** * Close a URelativeDateTimeFormatter object. Once closed it may no longer be used. * @param reldatefmt * The URelativeDateTimeFormatter object to close. 
* @stable ICU 57 */ U_CAPI void U_EXPORT2 ureldatefmt_close(URelativeDateTimeFormatter *reldatefmt); #if (NTDDI_VERSION >= NTDDI_WIN10_CO) struct UFormattedRelativeDateTime; /** * Opaque struct to contain the results of a URelativeDateTimeFormatter operation. * @stable ICU 64 */ typedef struct UFormattedRelativeDateTime UFormattedRelativeDateTime; /** * Creates an object to hold the result of a URelativeDateTimeFormatter * operation. The object can be used repeatedly; it is cleared whenever * passed to a format function. * * @param ec Set if an error occurs. * @return A pointer needing ownership. * @stable ICU 64 */ U_CAPI UFormattedRelativeDateTime* U_EXPORT2 ureldatefmt_openResult(UErrorCode* ec); /** * Returns a representation of a UFormattedRelativeDateTime as a UFormattedValue, * which can be subsequently passed to any API requiring that type. * * The returned object is owned by the UFormattedRelativeDateTime and is valid * only as long as the UFormattedRelativeDateTime is present and unchanged in memory. * * You can think of this method as a cast between types. * * @param ufrdt The object containing the formatted string. * @param ec Set if an error occurs. * @return A UFormattedValue owned by the input object. * @stable ICU 64 */ U_CAPI const UFormattedValue* U_EXPORT2 ureldatefmt_resultAsValue(const UFormattedRelativeDateTime* ufrdt, UErrorCode* ec); /** * Releases the UFormattedRelativeDateTime created by ureldatefmt_openResult. * * @param ufrdt The object to release. * @stable ICU 64 */ U_CAPI void U_EXPORT2 ureldatefmt_closeResult(UFormattedRelativeDateTime* ufrdt); #endif // (NTDDI_VERSION >= NTDDI_WIN10_CO) /** * Format a combination of URelativeDateTimeUnit and numeric * offset using a numeric style, e.g. "1 week ago", "in 1 week", * "5 weeks ago", "in 5 weeks". * * @param reldatefmt * The URelativeDateTimeFormatter object specifying the * format conventions. * @param offset * The signed offset for the specified unit. 
This will * be formatted according to this object's UNumberFormat * object. * @param unit * The unit to use when formatting the relative * date, e.g. UDAT_REL_UNIT_WEEK, UDAT_REL_UNIT_FRIDAY. * @param result * A pointer to a buffer to receive the formatted result. * @param resultCapacity * The maximum size of result. * @param status * A pointer to a UErrorCode to receive any errors. In * case of error status, the contents of result are * undefined. * @return * The length of the formatted result; may be greater * than resultCapacity, in which case an error is returned. * @stable ICU 57 */ U_CAPI int32_t U_EXPORT2 ureldatefmt_formatNumeric( const URelativeDateTimeFormatter* reldatefmt, double offset, URelativeDateTimeUnit unit, UChar* result, int32_t resultCapacity, UErrorCode* status); #if (NTDDI_VERSION >= NTDDI_WIN10_CO) /** * Format a combination of URelativeDateTimeUnit and numeric * offset using a numeric style, e.g. "1 week ago", "in 1 week", * "5 weeks ago", "in 5 weeks". * * This function populates a UFormattedRelativeDateTime rather than * writing the formatted text into a caller-supplied UChar buffer as * ureldatefmt_formatNumeric does. * * @param reldatefmt * The URelativeDateTimeFormatter object specifying the * format conventions. * @param offset * The signed offset for the specified unit. This will * be formatted according to this object's UNumberFormat * object. * @param unit * The unit to use when formatting the relative * date, e.g. UDAT_REL_UNIT_WEEK, UDAT_REL_UNIT_FRIDAY. * @param result * A pointer to a UFormattedRelativeDateTime to populate. * @param status * A pointer to a UErrorCode to receive any errors. In * case of error status, the contents of result are * undefined. 
* @stable ICU 64 */ U_CAPI void U_EXPORT2 ureldatefmt_formatNumericToResult( const URelativeDateTimeFormatter* reldatefmt, double offset, URelativeDateTimeUnit unit, UFormattedRelativeDateTime* result, UErrorCode* status); #endif // (NTDDI_VERSION >= NTDDI_WIN10_CO) /** * Format a combination of URelativeDateTimeUnit and numeric offset * using a text style if possible, e.g. "last week", "this week", * "next week", "yesterday", "tomorrow". 
Falls back to numeric * style if no appropriate text term is available for the specified * offset in the object's locale. * * @param reldatefmt * The URelativeDateTimeFormatter object specifying the * format conventions. * @param offset * The signed offset for the specified unit. * @param unit * The unit to use when formatting the relative * date, e.g. UDAT_REL_UNIT_WEEK, UDAT_REL_UNIT_FRIDAY. * @param result * A pointer to a buffer to receive the formatted result. * @param resultCapacity * The maximum size of result. * @param status * A pointer to a UErrorCode to receive any errors. In * case of error status, the contents of result are * undefined. * @return * The length of the formatted result; may be greater * than resultCapacity, in which case an error is returned. * @stable ICU 57 */ U_CAPI int32_t U_EXPORT2 ureldatefmt_format( const URelativeDateTimeFormatter* reldatefmt, double offset, URelativeDateTimeUnit unit, UChar* result, int32_t resultCapacity, UErrorCode* status); #if (NTDDI_VERSION >= NTDDI_WIN10_CO) /** * Format a combination of URelativeDateTimeUnit and numeric offset * using a text style if possible, e.g. "last week", "this week", * "next week", "yesterday", "tomorrow". Falls back to numeric * style if no appropriate text term is available for the specified * offset in the object's locale. * * This method populates a UFormattedRelativeDateTime, which exposes more * information than the string populated by format(). * * @param reldatefmt * The URelativeDateTimeFormatter object specifying the * format conventions. * @param offset * The signed offset for the specified unit. * @param unit * The unit to use when formatting the relative * date, e.g. UDAT_REL_UNIT_WEEK, UDAT_REL_UNIT_FRIDAY. * @param result * A pointer to a UFormattedRelativeDateTime to populate. * @param status * A pointer to a UErrorCode to receive any errors. In * case of error status, the contents of result are * undefined. 
* @stable ICU 64 */ U_CAPI void U_EXPORT2 ureldatefmt_formatToResult( const URelativeDateTimeFormatter* reldatefmt, double offset, URelativeDateTimeUnit unit, UFormattedRelativeDateTime* result, UErrorCode* status); #endif // (NTDDI_VERSION >= NTDDI_WIN10_CO) /** * Combines a relative date string and a time string in this object's * locale. This is done with the same date-time separator used for the * default calendar in this locale to produce a result such as * "yesterday at 3:45 PM". * * @param reldatefmt * The URelativeDateTimeFormatter object specifying the format conventions. * @param relativeDateString * The relative date string. * @param relativeDateStringLen * The length of relativeDateString; may be -1 if relativeDateString * is zero-terminated. * @param timeString * The time string. * @param timeStringLen * The length of timeString; may be -1 if timeString is zero-terminated. * @param result * A pointer to a buffer to receive the formatted result. * @param resultCapacity * The maximum size of result. * @param status * A pointer to a UErrorCode to receive any errors. In case of error status, * the contents of result are undefined. * @return * The length of the formatted result; may be greater than resultCapacity, * in which case an error is returned. * @stable ICU 57 */ U_CAPI int32_t U_EXPORT2 ureldatefmt_combineDateAndTime( const URelativeDateTimeFormatter* reldatefmt, const UChar * relativeDateString, int32_t relativeDateStringLen, const UChar * timeString, int32_t timeStringLen, UChar* result, int32_t resultCapacity, UErrorCode* status ); #endif /* !UCONFIG_NO_FORMATTING && !UCONFIG_NO_BREAK_ITERATION */ #endif // usearch.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (C) 2001-2011,2014 IBM and others. All rights reserved. 
********************************************************************** * Date Name Description * 06/28/2001 synwee Creation. ********************************************************************** */ #ifndef USEARCH_H #define USEARCH_H #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION /** * \file * \brief C API: StringSearch * * C APIs for an engine that provides language-sensitive text searching based * on the comparison rules defined in a UCollator data struct, * see ucol.h. This ensures that language eccentricity can be * handled, e.g. for the German collator, characters ß and SS will be matched * if case is chosen to be ignored. * See the * "ICU Collation Design Document" for more information. *

* As of ICU4C 4.0 / ICU4J 53, the implementation uses a linear search. In previous versions, * a modified form of the Boyer-Moore searching algorithm was used. For more information * on the modified Boyer-Moore algorithm see * * "Efficient Text Searching in Java", published in Java Report * in February, 1999. *

* There are 2 match options for selection:
* Let S' be the sub-string of a text string S between the offsets start and * end, i.e. the range [start, end]. *
* A pattern string P matches a text string S at the offsets [start, end] * if *

 
 * option 1. Some canonical equivalent of P matches some canonical equivalent 
 *           of S'
 * option 2. P matches S' and if P starts or ends with a combining mark, 
 *           there exists no non-ignorable combining mark before or after S' 
 *           in S respectively. 
 * 
* Option 2. will be the default. *

* This search has APIs similar to those of other text iteration mechanisms * such as the break iterators in ubrk.h. Using these * APIs, it is easy to scan through text looking for all occurrences of * a given pattern. This search iterator allows changing of direction by * calling a reset followed by a next or previous. * Though a direction change can occur without calling reset first, * this operation comes with some speed penalty. * Generally, match results in the forward direction will match the result * matches in the backwards direction in the reverse order. *

* usearch.h provides APIs to specify the starting position * within the text string to be searched, e.g. usearch_setOffset, * usearch_preceding and usearch_following. Since the * starting position will be set as it is specified, please take note that * there are some dangerous positions at which the search may render incorrect * results: *

    *
  • The midst of a substring that requires normalization. *
  • If the following match is to be found, the position should not be the * second character which needs to be swapped with the preceding * character. Vice versa, if the preceding match is to be found, the * position to search from should not be the first character which * needs to be swapped with the next character. E.g. certain Thai and * Lao characters require swapping. *
  • If a following pattern match is to be found, any position within a * contracting sequence except the first will fail. Vice versa if a * preceding pattern match is to be found, an invalid starting point * would be any character within a contracting sequence except the last. *
*

* A breakiterator can be used if only matches at logical breaks are desired. * Using a breakiterator will only give you results that exactly match the * boundaries given by the breakiterator. For instance the pattern "e" will * not be found in the string "\u00e9" if a character break iterator is used. *

* Options are provided to handle overlapping matches. * E.g. In English, overlapping matches produce the results 0 and 2 * for the pattern "abab" in the text "ababab", whereas mutually * exclusive matches only produce the result 0. *

* Options are also provided to implement "asymmetric search" as described in * * UTS #10 Unicode Collation Algorithm, specifically the USearchAttribute * USEARCH_ELEMENT_COMPARISON and its values. *

* Though collator attributes will be taken into consideration while * performing matches, there are no APIs here for setting and getting the * attributes. These attributes can be set by getting the collator * from usearch_getCollator and using the APIs in ucol.h. * Lastly to update String Search to the new collator attributes, * usearch_reset() has to be called. *

* Restriction:
* Currently there are no composite characters that consist of a * character with combining class > 0 before a character with combining * class == 0. However, if such a character exists in the future, the * search mechanism does not guarantee the results for option 1. * *

* Example of use:
*


 * char *tgtstr = "The quick brown fox jumped over the lazy fox";
 * char *patstr = "fox";
 * UChar target[64];
 * UChar pattern[16];
 * UErrorCode status = U_ZERO_ERROR;
 * u_uastrcpy(target, tgtstr);
 * u_uastrcpy(pattern, patstr);
 *
 * UStringSearch *search = usearch_open(pattern, -1, target, -1, "en_US", 
 *                                  NULL, &status);
 * if (U_SUCCESS(status)) {
 *     for (int pos = usearch_first(search, &status); 
 *          pos != USEARCH_DONE; 
 *          pos = usearch_next(search, &status))
 *     {
 *         printf("Found match at %d pos, length is %d\n", pos, 
 *                                        usearch_getMatchedLength(search));
 *     }
 * }
 *
 * usearch_close(search);
 * 
* @stable ICU 2.4 */ /** * USEARCH_DONE is returned by usearch_previous() and usearch_next() after * all valid matches have been returned, and by usearch_first() and * usearch_last() if there are no matches at all. * @stable ICU 2.4 */ #define USEARCH_DONE -1 /** * Data structure for searching * @stable ICU 2.4 */ struct UStringSearch; /** * Data structure for searching * @stable ICU 2.4 */ typedef struct UStringSearch UStringSearch; /** * Attributes that control text searching. They are set with * usearch_setAttribute and read with usearch_getAttribute, using values * from USearchAttributeValue. * @stable ICU 2.4 */ typedef enum { /** * Option for overlapping matches * @stable ICU 2.4 */ USEARCH_OVERLAP = 0, /** * Option to control how collation elements are compared. * The default value will be USEARCH_STANDARD_ELEMENT_COMPARISON. * @stable ICU 4.4 */ USEARCH_ELEMENT_COMPARISON = 2, } USearchAttribute; /** * Values that can be assigned to a USearchAttribute. * @stable ICU 2.4 */ typedef enum { /** * Default value for any USearchAttribute * @stable ICU 2.4 */ USEARCH_DEFAULT = -1, /** * Value for USEARCH_OVERLAP and USEARCH_CANONICAL_MATCH (option off) * @stable ICU 2.4 */ USEARCH_OFF, /** * Value for USEARCH_OVERLAP and USEARCH_CANONICAL_MATCH (option on) * @stable ICU 2.4 */ USEARCH_ON, /** * Value (default) for USEARCH_ELEMENT_COMPARISON; * standard collation element comparison at the specified collator * strength. * @stable ICU 4.4 */ USEARCH_STANDARD_ELEMENT_COMPARISON, /** * Value for USEARCH_ELEMENT_COMPARISON; * collation element comparison is modified to effectively provide * behavior between the specified strength and strength - 1. Collation * elements in the pattern that have the base weight for the specified * strength are treated as "wildcards" that match an element with any * other weight at that collation level in the searched text. 
For * example, with a secondary-strength English collator, a plain 'e' in * the pattern will match a plain e or an e with any diacritic in the * searched text, but an e with diacritic in the pattern will only * match an e with the same diacritic in the searched text. * * This supports "asymmetric search" as described in * * UTS #10 Unicode Collation Algorithm. 
* * @stable ICU 4.4 */ USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD, /** * Value for USEARCH_ELEMENT_COMPARISON. * collation element comparison is modified to effectively provide * behavior between the specified strength and strength - 1. Collation * elements in either the pattern or the searched text that have the * base weight for the specified strength are treated as "wildcards" * that match an element with any other weight at that collation level. * For example, with a secondary-strength English collator, a plain 'e' * in the pattern will match a plain e or an e with any diacritic in the * searched text, but an e with diacritic in the pattern will only * match an e with the same diacritic or a plain e in the searched text. * * This option is similar to "asymmetric search" as described in * [UTS #10 Unicode Collation Algorithm](http://www.unicode.org/reports/tr10/#Asymmetric_Search), * but also allows unmarked characters in the searched text to match * marked or unmarked versions of that character in the pattern. * * @stable ICU 4.4 */ USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD, } USearchAttributeValue; /* open and close ------------------------------------------------------ */ /** * Creates a String Search iterator data struct using the argument locale language * rule set. A collator will be created in the process, which will be owned by * this String Search and will be deleted in usearch_close. * * The UStringSearch retains a pointer to both the pattern and text strings. * The caller must not modify or delete them while using the UStringSearch. * * @param pattern for matching * @param patternlength length of the pattern, -1 for null-termination * @param text text string * @param textlength length of the text string, -1 for null-termination * @param locale name of locale for the rules to be used * @param breakiter A BreakIterator that will be used to restrict the points * at which matches are detected. 
If a match is found, but * the match's start or end index is not a boundary as * determined by the BreakIterator, the match will * be rejected and another will be searched for. * If this parameter is NULL, no break detection is * attempted. * @param status for errors if it occurs. If pattern or text is NULL, or if * patternlength or textlength is 0 then an * U_ILLEGAL_ARGUMENT_ERROR is returned. * @return search iterator data structure, or NULL if there is an error. * @stable ICU 2.4 */ U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern, int32_t patternlength, const UChar *text, int32_t textlength, const char *locale, UBreakIterator *breakiter, UErrorCode *status); /** * Creates a String Search iterator data struct using the argument collator language * rule set. Note, user retains the ownership of this collator, thus the * responsibility of deletion lies with the user. * NOTE: String Search cannot be instantiated from a collator that has * collate digits as numbers (CODAN) turned on (UCOL_NUMERIC_COLLATION). * * The UStringSearch retains a pointer to both the pattern and text strings. * The caller must not modify or delete them while using the UStringSearch. * * @param pattern for matching * @param patternlength length of the pattern, -1 for null-termination * @param text text string * @param textlength length of the text string, -1 for null-termination * @param collator used for the language rules * @param breakiter A BreakIterator that will be used to restrict the points * at which matches are detected. If a match is found, but * the match's start or end index is not a boundary as * determined by the BreakIterator, the match will * be rejected and another will be searched for. * If this parameter is NULL, no break detection is * attempted. * @param status for errors if it occurs. If collator, pattern or text is NULL, * or if patternlength or textlength is 0 then an * U_ILLEGAL_ARGUMENT_ERROR is returned. 
* @return search iterator data structure, or NULL if there is an error. * @stable ICU 2.4 */ U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator( const UChar *pattern, int32_t patternlength, const UChar *text, int32_t textlength, const UCollator *collator, UBreakIterator *breakiter, UErrorCode *status); /** * Destroys and cleans up the String Search iterator data struct. * If a collator was created in usearch_open, then it will be destroyed here. * @param searchiter The UStringSearch to clean up * @stable ICU 2.4 */ U_CAPI void U_EXPORT2 usearch_close(UStringSearch *searchiter); /* get and set methods -------------------------------------------------- */ /** * Sets the current position in the text string which the next search will * start from. Clears previous states. * This method takes the argument index and sets the position in the text * string accordingly without checking if the index is pointing to a * valid starting point to begin searching. * Search positions that may render incorrect results are highlighted in the * header comments. * @param strsrch search iterator data struct * @param position position to start next search from. If position is less * than or greater than the text range for searching, * an U_INDEX_OUTOFBOUNDS_ERROR will be returned. * @param status error status if any. * @stable ICU 2.4 */ U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch, int32_t position, UErrorCode *status); /** * Return the current index in the string text being searched. * If the iteration has gone past the end of the text (or past the beginning * for a backwards search), USEARCH_DONE is returned. * @param strsrch search iterator data struct * @return the current index in the text being searched, or USEARCH_DONE * if the iteration has gone past either end of the text. 
* @see #USEARCH_DONE * @stable ICU 2.4 */ U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch); /** * Sets the text searching attributes located in the enum USearchAttribute * with values from the enum USearchAttributeValue. * USEARCH_DEFAULT can be used for all attributes for resetting. 
* @param strsrch search iterator data struct * @param attribute text attribute to be set * @param value text attribute value * @param status for errors if it occurs * @see #usearch_getAttribute * @stable ICU 2.4 */ U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch, USearchAttribute attribute, USearchAttributeValue value, UErrorCode *status); /** * Gets the current value of a text searching attribute. * @param strsrch search iterator data struct * @param attribute text attribute to be retrieved * @return text attribute value * @see #usearch_setAttribute * @stable ICU 2.4 */ U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute( const UStringSearch *strsrch, USearchAttribute attribute); /** * Returns the index to the match in the text string that was searched. * This call returns a valid result only after a successful call to * usearch_first, usearch_next, usearch_previous, * or usearch_last. * Just after construction, or after a searching method returns * USEARCH_DONE, this method will return USEARCH_DONE. *

* Use usearch_getMatchedLength to get the matched string length. * @param strsrch search iterator data struct * @return index to a substring within the text string that is being * searched, or USEARCH_DONE if there is no current match. * @see #usearch_first * @see #usearch_next * @see #usearch_previous * @see #usearch_last * @see #USEARCH_DONE * @stable ICU 2.4 */ U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart( const UStringSearch *strsrch); /** * Returns the length of text in the string which matches the search pattern. * This call returns a valid result only after a successful call to * usearch_first, usearch_next, usearch_previous, * or usearch_last. * Just after construction, or after a searching method returns * USEARCH_DONE, this method will return 0. * @param strsrch search iterator data struct * @return The length of the match in the string text, or 0 if there is no * match currently. * @see #usearch_first * @see #usearch_next * @see #usearch_previous * @see #usearch_last * @see #USEARCH_DONE * @stable ICU 2.4 */ U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength( const UStringSearch *strsrch); /** * Returns the text that was matched by the most recent call to * usearch_first, usearch_next, usearch_previous, * or usearch_last. * If the iterator is not pointing at a valid match (e.g. just after * construction or after USEARCH_DONE has been returned), returns * an empty string. If result is not large enough to store the matched text, * result will be filled with the partial text and an U_BUFFER_OVERFLOW_ERROR * will be returned in status. result will be null-terminated whenever * possible. If the buffer fits the matched text exactly, null-termination * is not possible and U_STRING_NOT_TERMINATED_ERROR is set in status. * Pre-flighting can be done either with resultCapacity = 0 or with the API * usearch_getMatchedLength. 
* @param strsrch search iterator data struct * @param result UChar buffer to store the matched string * @param resultCapacity length of the result buffer * @param status error returned if result is not large enough * @return exact length of the matched text, not counting the null-termination * @see #usearch_first * @see #usearch_next * @see #usearch_previous * @see #usearch_last * @see #USEARCH_DONE * @stable ICU 2.4 */ U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch, UChar *result, int32_t resultCapacity, UErrorCode *status); #if !UCONFIG_NO_BREAK_ITERATION /** * Set the BreakIterator that will be used to restrict the points at which * matches are detected. * @param strsrch search iterator data struct * @param breakiter A BreakIterator that will be used to restrict the points * at which matches are detected. If a match is found, but * the match's start or end index is not a boundary as * determined by the BreakIterator, the match will * be rejected and another will be searched for. * If this parameter is NULL, no break detection is * attempted. * @param status for errors if it occurs * @see #usearch_getBreakIterator * @stable ICU 2.4 */ U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch *strsrch, UBreakIterator *breakiter, UErrorCode *status); /** * Returns the BreakIterator that is used to restrict the points at which * matches are detected. This will be the same object that was passed to the * constructor or to usearch_setBreakIterator. Note that * NULL * is a legal value; it means that break detection should not be attempted. * @param strsrch search iterator data struct * @return break iterator used * @see #usearch_setBreakIterator * @stable ICU 2.4 */ U_CAPI const UBreakIterator * U_EXPORT2 usearch_getBreakIterator( const UStringSearch *strsrch); #endif /** * Set the string text to be searched. Text iteration will hence begin at the * start of the text string. 
This method is useful if you want to re-use an * iterator to search for the same pattern within a different body of text. * * The UStringSearch retains a pointer to the text string. The caller must not * modify or delete the string while using the UStringSearch. * * @param strsrch search iterator data struct * @param text new string to look for match * @param textlength length of the new string, -1 for null-termination * @param status for errors if it occurs. If text is NULL, or textlength is 0 * then an U_ILLEGAL_ARGUMENT_ERROR is returned with no change * done to strsrch. * @see #usearch_getText * @stable ICU 2.4 */ U_CAPI void U_EXPORT2 usearch_setText( UStringSearch *strsrch, const UChar *text, int32_t textlength, UErrorCode *status); /** * Return the string text to be searched. * @param strsrch search iterator data struct * @param length returned string text length * @return string text * @see #usearch_setText * @stable ICU 2.4 */ U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch, int32_t *length); /** * Gets the collator used for the language rules. *

* Deleting the returned UCollator before calling * usearch_close would cause the string search to fail. * usearch_close will delete the collator if this search owns it. * @param strsrch search iterator data struct * @return collator * @stable ICU 2.4 */ U_CAPI UCollator * U_EXPORT2 usearch_getCollator( const UStringSearch *strsrch); /** * Sets the collator used for the language rules. User retains the ownership * of this collator, thus the responsibility of deletion lies with the user. * This method causes internal data such as the pattern collation elements * and shift tables to be recalculated, but the iterator's position is unchanged. * @param strsrch search iterator data struct * @param collator to be used * @param status for errors if it occurs * @stable ICU 2.4 */ U_CAPI void U_EXPORT2 usearch_setCollator( UStringSearch *strsrch, const UCollator *collator, UErrorCode *status); /** * Sets the pattern used for matching. * Internal data like the pattern collation elements will be recalculated, but the * iterator's position is unchanged. * * The UStringSearch retains a pointer to the pattern string. The caller must not * modify or delete the string while using the UStringSearch. * * @param strsrch search iterator data struct * @param pattern string * @param patternlength pattern length, -1 for null-terminated string * @param status for errors if it occurs. If pattern is NULL, or patternlength is 0 * then an U_ILLEGAL_ARGUMENT_ERROR is returned with no change * done to strsrch. 
* @stable ICU 2.4 */ U_CAPI void U_EXPORT2 usearch_setPattern( UStringSearch *strsrch, const UChar *pattern, int32_t patternlength, UErrorCode *status); /** * Gets the search pattern. * @param strsrch search iterator data struct * @param length receives the length of the pattern; -1 indicates that the * pattern is null-terminated * @return pattern string * @stable ICU 2.4 */ U_CAPI const UChar * U_EXPORT2 usearch_getPattern( const UStringSearch *strsrch, int32_t *length); /* methods ------------------------------------------------------------- */ /** * Returns the first index at which the string text matches the search * pattern. * The iterator is adjusted so that its current index (as returned by * usearch_getOffset) is the match position if one was found. * If a match is not found, USEARCH_DONE will be returned and * the iterator will be adjusted to the index USEARCH_DONE. * @param strsrch search iterator data struct * @param status for errors if it occurs * @return The character index of the first match, or * USEARCH_DONE if there are no matches. * @see #usearch_getOffset * @see #USEARCH_DONE * @stable ICU 2.4 */ U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch, UErrorCode *status); /** * Returns the first index equal to or greater than position at which * the string text * matches the search pattern. The iterator is adjusted so that its current * index (as returned by usearch_getOffset) is the match position if * one was found. * If a match is not found, USEARCH_DONE will be returned and * the iterator will be adjusted to the index USEARCH_DONE. *

* Search positions that may render incorrect results are highlighted in the * header comments. If position is less than or greater than the text range * for searching, an U_INDEX_OUTOFBOUNDS_ERROR will be returned. * @param strsrch search iterator data struct * @param position to start the search at * @param status for errors if it occurs * @return The character index of the first match following position, * or USEARCH_DONE if there are no matches. * @see #usearch_getOffset * @see #USEARCH_DONE * @stable ICU 2.4 */ U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch, int32_t position, UErrorCode *status); /** * Returns the last index in the target text at which it matches the search * pattern. The iterator is adjusted so that its current * index (as returned by usearch_getOffset) is the match position if * one was found. * If a match is not found, USEARCH_DONE will be returned and * the iterator will be adjusted to the index USEARCH_DONE. * @param strsrch search iterator data struct * @param status for errors if it occurs * @return The index of the last match, or USEARCH_DONE if there * are no matches. * @see #usearch_getOffset * @see #USEARCH_DONE * @stable ICU 2.4 */ U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch, UErrorCode *status); /** * Returns the first index less than position at which the string text * matches the search pattern. The iterator is adjusted so that its current * index (as returned by usearch_getOffset) is the match position if * one was found. * If a match is not found, USEARCH_DONE will be returned and * the iterator will be adjusted to the index USEARCH_DONE. *

* Search positions that may render incorrect results are highlighted in the * header comments. If position is less than or greater than the text range * for searching, a U_INDEX_OUTOFBOUNDS_ERROR will be returned. *

* When the USEARCH_OVERLAP option is off, the last index of the * result match is always less than position. * When USEARCH_OVERLAP is on, the result match may span across * position. * @param strsrch search iterator data struct * @param position index position the search is to begin at * @param status for errors if it occurs * @return The character index of the first match preceding position, * or USEARCH_DONE if there are no matches. * @see #usearch_getOffset * @see #USEARCH_DONE * @stable ICU 2.4 */ U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch, int32_t position, UErrorCode *status); /** * Returns the index of the next point at which the string text matches the * search pattern, starting from the current position. * The iterator is adjusted so that its current * index (as returned by usearch_getOffset) is the match position if * one was found. * If a match is not found, USEARCH_DONE will be returned and * the iterator will be adjusted to the index USEARCH_DONE. * @param strsrch search iterator data struct * @param status for errors if it occurs * @return The index of the next match after the current position, or * USEARCH_DONE if there are no more matches. * @see #usearch_first * @see #usearch_getOffset * @see #USEARCH_DONE * @stable ICU 2.4 */ U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch, UErrorCode *status); /** * Returns the index of the previous point at which the string text matches * the search pattern, starting at the current position. * The iterator is adjusted so that its current * index (as returned by usearch_getOffset) is the match position if * one was found. * If a match is not found, USEARCH_DONE will be returned and * the iterator will be adjusted to the index USEARCH_DONE. * @param strsrch search iterator data struct * @param status for errors if it occurs * @return The index of the previous match before the current position, * or USEARCH_DONE if there are no more matches. 
* @see #usearch_last * @see #usearch_getOffset * @see #USEARCH_DONE * @stable ICU 2.4 */ U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch, UErrorCode *status); /** * Reset the iteration. * Search will begin at the start of the text string if a forward iteration * is initiated before a backwards iteration. Otherwise, if a backwards * iteration is initiated before a forwards iteration, the search will begin * at the end of the text string. * @param strsrch search iterator data struct * @see #usearch_first * @stable ICU 2.4 */ U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch); #endif /* #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION */ #endif // uspoof.h // Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* *************************************************************************** * Copyright (C) 2008-2016, International Business Machines Corporation * and others. All Rights Reserved. *************************************************************************** * file name: uspoof.h * encoding: UTF-8 * tab size: 8 (not used) * indentation:4 * * created on: 2008Feb13 * created by: Andy Heninger * * Unicode Spoof Detection */ #ifndef USPOOF_H #define USPOOF_H #if !UCONFIG_NO_NORMALIZATION /** * \file * \brief C API: Unicode Security and Spoofing Detection * *

* This class, based on Unicode Technical Report #36 and * Unicode Technical Standard #39, has two main functions: * *

    *
  1. Checking whether two strings are visually confusable with each other, such as "Harvest" and * "Ηarvest", where the second string starts with the Greek capital letter Eta.
  2. *
  3. Checking whether an individual string is likely to be an attempt at confusing the reader (spoof * detection), such as "paypal" with some Latin characters substituted with Cyrillic look-alikes.
  4. *
* *

* Although originally designed as a method for flagging suspicious identifier strings such as URLs, * USpoofChecker has a number of other practical use cases, such as preventing attempts to evade bad-word * content filters. * *

* The functions of this class are exposed as C API, with a handful of syntactical conveniences for C++. * *

Confusables

* *

* The following example shows how to use USpoofChecker to check for confusability between two strings: * * \code{.c} * UErrorCode status = U_ZERO_ERROR; * UChar* str1 = (UChar*) u"Harvest"; * UChar* str2 = (UChar*) u"\u0397arvest"; // with U+0397 GREEK CAPITAL LETTER ETA * * USpoofChecker* sc = uspoof_open(&status); * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status); * * int32_t bitmask = uspoof_areConfusable(sc, str1, -1, str2, -1, &status); * UBool result = bitmask != 0; * // areConfusable: 1 (status: U_ZERO_ERROR) * printf("areConfusable: %d (status: %s)\n", result, u_errorName(status)); * uspoof_close(sc); * \endcode * *

* The call to {@link uspoof_open} creates a USpoofChecker object; the call to {@link uspoof_setChecks} * enables confusable checking and disables all other checks; the call to {@link uspoof_areConfusable} performs the * confusability test; and the following line extracts the result out of the return value. For best performance, * the instance should be created once (e.g., upon application startup), and the efficient * {@link uspoof_areConfusable} method can be used at runtime. * *

* The type {@link LocalUSpoofCheckerPointer} is exposed for C++ programmers. It will automatically call * {@link uspoof_close} when the object goes out of scope: * * \code{.cpp} * UErrorCode status = U_ZERO_ERROR; * LocalUSpoofCheckerPointer sc(uspoof_open(&status)); * uspoof_setChecks(sc.getAlias(), USPOOF_CONFUSABLE, &status); * // ... * \endcode * * UTS 39 defines two strings to be confusable if they map to the same skeleton string. A skeleton can * be thought of as a "hash code". {@link uspoof_getSkeleton} computes the skeleton for a particular string, so * the following snippet is equivalent to the example above: * * \code{.c} * UErrorCode status = U_ZERO_ERROR; * UChar* str1 = (UChar*) u"Harvest"; * UChar* str2 = (UChar*) u"\u0397arvest"; // with U+0397 GREEK CAPITAL LETTER ETA * * USpoofChecker* sc = uspoof_open(&status); * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status); * * // Get skeleton 1 * int32_t skel1Len = uspoof_getSkeleton(sc, 0, str1, -1, NULL, 0, &status); * UChar* skel1 = (UChar*) malloc(++skel1Len * sizeof(UChar)); * status = U_ZERO_ERROR; * uspoof_getSkeleton(sc, 0, str1, -1, skel1, skel1Len, &status); * * // Get skeleton 2 * int32_t skel2Len = uspoof_getSkeleton(sc, 0, str2, -1, NULL, 0, &status); * UChar* skel2 = (UChar*) malloc(++skel2Len * sizeof(UChar)); * status = U_ZERO_ERROR; * uspoof_getSkeleton(sc, 0, str2, -1, skel2, skel2Len, &status); * * // Are the skeletons the same? 
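* UBool result = u_strcmp(skel1, skel2) == 0; * // areConfusable: 1 (status: U_ZERO_ERROR) * printf("areConfusable: %d (status: %s)\n", result, u_errorName(status)); * uspoof_close(sc); * free(skel1); * free(skel2); * \endcode * * If you need to check if a string is confusable with any string in a dictionary of many strings, rather than calling * {@link uspoof_areConfusable} many times in a loop, {@link uspoof_getSkeleton} can be used instead, as shown below: * * \code{.c} * UErrorCode status = U_ZERO_ERROR; * #define DICTIONARY_LENGTH 2 * UChar* dictionary[DICTIONARY_LENGTH] = { (UChar*) u"lorem", (UChar*) u"ipsum" }; * UChar* skeletons[DICTIONARY_LENGTH]; * UChar* str = (UChar*) u"1orern"; * * // Setup: * USpoofChecker* sc = uspoof_open(&status); * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status); * for (size_t i=0; i < DICTIONARY_LENGTH; i++) { * UChar* word = dictionary[i]; * int32_t len = uspoof_getSkeleton(sc, 0, word, -1, NULL, 0, &status); * skeletons[i] = (UChar*) malloc(++len * sizeof(UChar)); * status = U_ZERO_ERROR; * uspoof_getSkeleton(sc, 0, word, -1, skeletons[i], len, &status); * } * * // Live Check: * { * int32_t len = uspoof_getSkeleton(sc, 0, str, -1, NULL, 0, &status); * UChar* skel = (UChar*) malloc(++len * sizeof(UChar)); * status = U_ZERO_ERROR; * uspoof_getSkeleton(sc, 0, str, -1, skel, len, &status); * UBool result = false; * for (size_t i=0; i < DICTIONARY_LENGTH; i++) { * result = u_strcmp(skel, skeletons[i]) == 0; * if (result == true) { break; } * } * // Has confusable in dictionary: 1 (status: U_ZERO_ERROR) * printf("Has confusable in dictionary: %d (status: %s)\n", result, u_errorName(status)); * free(skel); * } * * for (size_t i=0; i < DICTIONARY_LENGTH; i++) { * free(skeletons[i]); * } * uspoof_close(sc); * \endcode * * Note: Since the Unicode confusables mapping table is frequently updated, confusable skeletons are not * guaranteed to be the same between ICU releases. 
We therefore recommend that you always compute confusable skeletons * at runtime and do not rely on creating a permanent, or difficult to update, database of skeletons. * *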

Spoof Detection

* * The following snippet shows a minimal example of using USpoofChecker to perform spoof detection on a * string: * * \code{.c} * UErrorCode status = U_ZERO_ERROR; * UChar* str = (UChar*) u"p\u0430ypal"; // with U+0430 CYRILLIC SMALL LETTER A * * // Get the default set of allowable characters: * USet* allowed = uset_openEmpty(); * uset_addAll(allowed, uspoof_getRecommendedSet(&status)); * uset_addAll(allowed, uspoof_getInclusionSet(&status)); * * USpoofChecker* sc = uspoof_open(&status); * uspoof_setAllowedChars(sc, allowed, &status); * uspoof_setRestrictionLevel(sc, USPOOF_MODERATELY_RESTRICTIVE); * * int32_t bitmask = uspoof_check(sc, str, -1, NULL, &status); * UBool result = bitmask != 0; * // fails checks: 1 (status: U_ZERO_ERROR) * printf("fails checks: %d (status: %s)\n", result, u_errorName(status)); * uspoof_close(sc); * uset_close(allowed); * \endcode * * As in the case for confusability checking, it is good practice to create one USpoofChecker instance at * startup, and call the cheaper {@link uspoof_check} online. We specify the set of * allowed characters to be those with type RECOMMENDED or INCLUSION, according to the recommendation in UTS 39. * * In addition to {@link uspoof_check}, the function {@link uspoof_checkUTF8} is exposed for UTF8-encoded char* strings, * and {@link uspoof_checkUnicodeString} is exposed for C++ programmers. * * If the {@link USPOOF_AUX_INFO} check is enabled, a limited amount of information on why a string failed the checks * is available in the returned bitmask. 
For complete information, use the {@link uspoof_check2} class of functions * with a {@link USpoofCheckResult} parameter: * * \code{.c} * UErrorCode status = U_ZERO_ERROR; * UChar* str = (UChar*) u"p\u0430ypal"; // with U+0430 CYRILLIC SMALL LETTER A * * // Get the default set of allowable characters: * USet* allowed = uset_openEmpty(); * uset_addAll(allowed, uspoof_getRecommendedSet(&status)); * uset_addAll(allowed, uspoof_getInclusionSet(&status)); * * USpoofChecker* sc = uspoof_open(&status); * uspoof_setAllowedChars(sc, allowed, &status); * uspoof_setRestrictionLevel(sc, USPOOF_MODERATELY_RESTRICTIVE); * * USpoofCheckResult* checkResult = uspoof_openCheckResult(&status); * int32_t bitmask = uspoof_check2(sc, str, -1, checkResult, &status); * * int32_t failures1 = bitmask; * int32_t failures2 = uspoof_getCheckResultChecks(checkResult, &status); * assert(failures1 == failures2); * // checks that failed: 0x00000010 (status: U_ZERO_ERROR) * printf("checks that failed: %#010x (status: %s)\n", failures1, u_errorName(status)); * * // Cleanup: * uspoof_close(sc); * uset_close(allowed); * uspoof_closeCheckResult(checkResult); * \endcode * * C++ users can take advantage of a few syntactical conveniences. 
The following snippet is functionally * equivalent to the one above: * * \code{.cpp} * UErrorCode status = U_ZERO_ERROR; * UnicodeString str((UChar*) u"p\u0430ypal"); // with U+0430 CYRILLIC SMALL LETTER A * * // Get the default set of allowable characters: * UnicodeSet allowed; * allowed.addAll(*uspoof_getRecommendedUnicodeSet(&status)); * allowed.addAll(*uspoof_getInclusionUnicodeSet(&status)); * * LocalUSpoofCheckerPointer sc(uspoof_open(&status)); * uspoof_setAllowedChars(sc.getAlias(), allowed.toUSet(), &status); * uspoof_setRestrictionLevel(sc.getAlias(), USPOOF_MODERATELY_RESTRICTIVE); * * LocalUSpoofCheckResultPointer checkResult(uspoof_openCheckResult(&status)); * int32_t bitmask = uspoof_check2UnicodeString(sc.getAlias(), str, checkResult.getAlias(), &status); * * int32_t failures1 = bitmask; * int32_t failures2 = uspoof_getCheckResultChecks(checkResult.getAlias(), &status); * assert(failures1 == failures2); * // checks that failed: 0x00000010 (status: U_ZERO_ERROR) * printf("checks that failed: %#010x (status: %s)\n", failures1, u_errorName(status)); * * // Explicit cleanup not necessary. * \endcode * * The return value is a bitmask of the checks that failed. In this case, there was one check that failed: * {@link USPOOF_RESTRICTION_LEVEL}, corresponding to the fifth bit (16). The possible checks are: * *
    *
  • RESTRICTION_LEVEL: flags strings that violate the * Restriction Level test as specified in UTS * 39; in most cases, this means flagging strings that contain characters from multiple different scripts.
  • *
  • INVISIBLE: flags strings that contain invisible characters, such as zero-width spaces, or character * sequences that are likely not to display, such as multiple occurrences of the same non-spacing mark.
  • *
  • CHAR_LIMIT: flags strings that contain characters outside of a specified set of acceptable * characters. See {@link uspoof_setAllowedChars} and {@link uspoof_setAllowedLocales}.
  • *
  • MIXED_NUMBERS: flags strings that contain digits from multiple different numbering systems.
  • *
* *

* These checks can be enabled independently of each other. For example, if you were interested in checking for only the * INVISIBLE and MIXED_NUMBERS conditions, you could do: * * \code{.c} * UErrorCode status = U_ZERO_ERROR; * UChar* str = (UChar*) u"8\u09EA"; // 8 mixed with U+09EA BENGALI DIGIT FOUR * * USpoofChecker* sc = uspoof_open(&status); * uspoof_setChecks(sc, USPOOF_INVISIBLE | USPOOF_MIXED_NUMBERS, &status); * * int32_t bitmask = uspoof_check2(sc, str, -1, NULL, &status); * UBool result = bitmask != 0; * // fails checks: 1 (status: U_ZERO_ERROR) * printf("fails checks: %d (status: %s)\n", result, u_errorName(status)); * uspoof_close(sc); * \endcode * * Here is an example in C++ showing how to compute the restriction level of a string: * * \code{.cpp} * UErrorCode status = U_ZERO_ERROR; * UnicodeString str((UChar*) u"p\u0430ypal"); // with U+0430 CYRILLIC SMALL LETTER A * * // Get the default set of allowable characters: * UnicodeSet allowed; * allowed.addAll(*uspoof_getRecommendedUnicodeSet(&status)); * allowed.addAll(*uspoof_getInclusionUnicodeSet(&status)); * * LocalUSpoofCheckerPointer sc(uspoof_open(&status)); * uspoof_setAllowedChars(sc.getAlias(), allowed.toUSet(), &status); * uspoof_setRestrictionLevel(sc.getAlias(), USPOOF_MODERATELY_RESTRICTIVE); * uspoof_setChecks(sc.getAlias(), USPOOF_RESTRICTION_LEVEL | USPOOF_AUX_INFO, &status); * * LocalUSpoofCheckResultPointer checkResult(uspoof_openCheckResult(&status)); * int32_t bitmask = uspoof_check2UnicodeString(sc.getAlias(), str, checkResult.getAlias(), &status); * * URestrictionLevel restrictionLevel = uspoof_getCheckResultRestrictionLevel(checkResult.getAlias(), &status); * // Since USPOOF_AUX_INFO was enabled, the restriction level is also available in the upper bits of the bitmask: * assert((restrictionLevel & bitmask) == restrictionLevel); * // Restriction level: 0x50000000 (status: U_ZERO_ERROR) * printf("Restriction level: %#010x (status: %s)\n", restrictionLevel, u_errorName(status)); * 
\endcode * * The code '0x50000000' corresponds to the restriction level USPOOF_MINIMALLY_RESTRICTIVE. Since * USPOOF_MINIMALLY_RESTRICTIVE is weaker than USPOOF_MODERATELY_RESTRICTIVE, the string fails the check. * * Note: The Restriction Level is the most powerful of the checks. The full logic is documented in * UTS 39, but the basic idea is that strings * are restricted to contain characters from only a single script, except that most scripts are allowed to have * Latin characters interspersed. Although the default restriction level is HIGHLY_RESTRICTIVE, it is * recommended that users set their restriction level to MODERATELY_RESTRICTIVE, which allows Latin mixed * with all other scripts except Cyrillic, Greek, and Cherokee, with which it is often confusable. For more details on * the levels, see UTS 39 or {@link URestrictionLevel}. The Restriction Level test is aware of the set of * allowed characters set in {@link uspoof_setAllowedChars}. Note that characters which have script code * COMMON or INHERITED, such as numbers and punctuation, are ignored when computing whether a string has multiple * scripts. * *

Additional Information

* * A USpoofChecker instance may be used repeatedly to perform checks on any number of identifiers. * * Thread Safety: The test functions for checking a single identifier, or for testing whether * two identifiers are possibly confusable, are thread safe. They may be called concurrently, from multiple threads, * using the same USpoofChecker instance. * * More generally, the standard ICU thread safety rules apply: functions that take a const USpoofChecker parameter are * thread safe. Those that take a non-const USpoofChecker are not thread safe. * * @stable ICU 4.6 */ U_CDECL_BEGIN struct USpoofChecker; /** * @stable ICU 4.2 */ typedef struct USpoofChecker USpoofChecker; /**< typedef for C of USpoofChecker */ struct USpoofCheckResult; /** * @see uspoof_openCheckResult * @stable ICU 58 */ typedef struct USpoofCheckResult USpoofCheckResult; /** * Enum for the kinds of checks that USpoofChecker can perform. * These enum values are used both to select the set of checks that * will be performed, and to report results from the check function. * * @stable ICU 4.2 */ typedef enum USpoofChecks { /** * When performing the two-string {@link uspoof_areConfusable} test, this flag in the return value indicates * that the two strings are visually confusable and that they are from the same script, according to UTS 39 section * 4. * * @see uspoof_areConfusable * @stable ICU 4.2 */ USPOOF_SINGLE_SCRIPT_CONFUSABLE = 1, /** * When performing the two-string {@link uspoof_areConfusable} test, this flag in the return value indicates * that the two strings are visually confusable and that they are not from the same script, according to UTS * 39 section 4. 
* * @see uspoof_areConfusable * @stable ICU 4.2 */ USPOOF_MIXED_SCRIPT_CONFUSABLE = 2, /** * When performing the two-string {@link uspoof_areConfusable} test, this flag in the return value indicates * that the two strings are visually confusable and that they are not from the same script but both of them are * single-script strings, according to UTS 39 section 4. * * @see uspoof_areConfusable * @stable ICU 4.2 */ USPOOF_WHOLE_SCRIPT_CONFUSABLE = 4, /** * Enable this flag in {@link uspoof_setChecks} to turn on all types of confusables. You may set * the checks to some subset of SINGLE_SCRIPT_CONFUSABLE, MIXED_SCRIPT_CONFUSABLE, or WHOLE_SCRIPT_CONFUSABLE to * make {@link uspoof_areConfusable} return only those types of confusables. * * @see uspoof_areConfusable * @see uspoof_getSkeleton * @stable ICU 58 */ USPOOF_CONFUSABLE = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_WHOLE_SCRIPT_CONFUSABLE, /** * Check that an identifier is no looser than the specified RestrictionLevel. * The default if {@link uspoof_setRestrictionLevel} is not called is HIGHLY_RESTRICTIVE. * * If USPOOF_AUX_INFO is enabled the actual restriction level of the * identifier being tested will also be returned by uspoof_check(). * * @see URestrictionLevel * @see uspoof_setRestrictionLevel * @see USPOOF_AUX_INFO * * @stable ICU 51 */ USPOOF_RESTRICTION_LEVEL = 16, /** Check an identifier for the presence of invisible characters, * such as zero-width spaces, or character sequences that are * likely not to display, such as multiple occurrences of the same * non-spacing mark. This check does not test the input string as a whole * for conformance to any particular syntax for identifiers. */ USPOOF_INVISIBLE = 32, /** Check that an identifier contains only characters from a specified set * of acceptable characters. See {@link uspoof_setAllowedChars} and * {@link uspoof_setAllowedLocales}. 
Note that a string that fails this check * will also fail the {@link USPOOF_RESTRICTION_LEVEL} check. */ USPOOF_CHAR_LIMIT = 64, /** * Check that an identifier does not mix numbers from different numbering systems. * For more information, see UTS 39 section 5.3. * * @stable ICU 51 */ USPOOF_MIXED_NUMBERS = 128, #if (NTDDI_VERSION >= NTDDI_WIN10_CO) /** * Check that an identifier does not have a combining character following a character in which that * combining character would be hidden; for example 'i' followed by a U+0307 combining dot. * * More specifically, the following characters are forbidden from preceding a U+0307: *
    *
  • Those with the Soft_Dotted Unicode property (which includes 'i' and 'j')
  • *
  • Latin lowercase letter 'l'
  • *
  • Dotless 'i' and 'j' ('ı' and 'ȷ', U+0131 and U+0237)
  • *
  • Any character whose confusable prototype ends with such a character * (Soft_Dotted, 'l', 'ı', or 'ȷ')
  • *
* In addition, combining characters are allowed between the above characters and U+0307 except those * with combining class 0 or combining class "Above" (230, same class as U+0307). * * This list and the number of combining characters considered by this check may grow over time. * * @stable ICU 62 */ USPOOF_HIDDEN_OVERLAY = 256, #endif // (NTDDI_VERSION >= NTDDI_WIN10_CO) /** * Enable all spoof checks. * * @stable ICU 4.6 */ USPOOF_ALL_CHECKS = 0xFFFF, /** * Enable the return of auxiliary (non-error) information in the * upper bits of the check results value. * * If this "check" is not enabled, the results of {@link uspoof_check} will be * zero when an identifier passes all of the enabled checks. * * If this "check" is enabled, (uspoof_check() & {@link USPOOF_ALL_CHECKS}) will * be zero when an identifier passes all checks. * * @stable ICU 51 */ USPOOF_AUX_INFO = 0x40000000 } USpoofChecks; /** * Constants from UAX #39 for use in {@link uspoof_setRestrictionLevel}, and * for returned identifier restriction levels in check results. * * @stable ICU 51 * * @see uspoof_setRestrictionLevel * @see uspoof_check */ typedef enum URestrictionLevel { /** * All characters in the string are in the identifier profile and all characters in the string are in the * ASCII range. * * @stable ICU 51 */ USPOOF_ASCII = 0x10000000, /** * The string classifies as ASCII-Only, or all characters in the string are in the identifier profile and * the string is single-script, according to the definition in UTS 39 section 5.1. * * @stable ICU 53 */ USPOOF_SINGLE_SCRIPT_RESTRICTIVE = 0x20000000, /** * The string classifies as Single Script, or all characters in the string are in the identifier profile and * the string is covered by any of the following sets of scripts, according to the definition in UTS 39 * section 5.1: *
    *
  • Latin + Han + Bopomofo (or equivalently: Latn + Hanb)
  • *
  • Latin + Han + Hiragana + Katakana (or equivalently: Latn + Jpan)
  • *
  • Latin + Han + Hangul (or equivalently: Latn + Kore)
  • *
* This is the default restriction in ICU. * * @stable ICU 51 */ USPOOF_HIGHLY_RESTRICTIVE = 0x30000000, /** * The string classifies as Highly Restrictive, or all characters in the string are in the identifier profile * and the string is covered by Latin and any one other Recommended or Aspirational script, except Cyrillic, * Greek, and Cherokee. * * @stable ICU 51 */ USPOOF_MODERATELY_RESTRICTIVE = 0x40000000, /** * All characters in the string are in the identifier profile. Allow arbitrary mixtures of scripts. * * @stable ICU 51 */ USPOOF_MINIMALLY_RESTRICTIVE = 0x50000000, /** * Any valid identifiers, including characters outside of the Identifier Profile. * * @stable ICU 51 */ USPOOF_UNRESTRICTIVE = 0x60000000, /** * Mask for selecting the Restriction Level bits from the return value of {@link uspoof_check}. * * @stable ICU 53 */ USPOOF_RESTRICTION_LEVEL_MASK = 0x7F000000, } URestrictionLevel; /** * Create a Unicode Spoof Checker, configured to perform all * checks except for USPOOF_LOCALE_LIMIT and USPOOF_CHAR_LIMIT. * Note that additional checks may be added in the future, * resulting in changes to the default checking behavior. * * @param status The error code, set if this function encounters a problem. * @return the newly created Spoof Checker * @stable ICU 4.2 */ U_CAPI USpoofChecker * U_EXPORT2 uspoof_open(UErrorCode *status); /** * Open a Spoof checker from its serialized form, stored in 32-bit-aligned memory. * Inverse of uspoof_serialize(). * The memory containing the serialized data must remain valid and unchanged * as long as the spoof checker, or any cloned copies of the spoof checker, * are in use. Ownership of the memory remains with the caller. * The spoof checker (and any clones) must be closed prior to deleting the * serialized data. 
* * @param data a pointer to 32-bit-aligned memory containing the serialized form of spoof data * @param length the number of bytes available at data; * can be more than necessary * @param pActualLength receives the actual number of bytes at data taken up by the data; * can be NULL * @param pErrorCode ICU error code * @return the spoof checker. * * @see uspoof_open * @see uspoof_serialize * @stable ICU 4.2 */ U_CAPI USpoofChecker * U_EXPORT2 uspoof_openFromSerialized(const void *data, int32_t length, int32_t *pActualLength, UErrorCode *pErrorCode); /** * Open a Spoof Checker from the source form of the spoof data. * The input corresponds to the Unicode data file confusables.txt * as described in Unicode UAX #39. The syntax of the source data * is as described in UAX #39 for this file, and the content of * this file is acceptable input. * * The character encoding of the (char *) input text is UTF-8. * * @param confusables a pointer to the confusable characters definitions, * as found in file confusables.txt from unicode.org. * @param confusablesLen The length of the confusables text, or -1 if the * input string is zero terminated. * @param confusablesWholeScript * Deprecated in ICU 58. No longer used. * @param confusablesWholeScriptLen * Deprecated in ICU 58. No longer used. * @param errType In the event of an error in the input, indicates * which of the input files contains the error. * The value is one of USPOOF_SINGLE_SCRIPT_CONFUSABLE or * USPOOF_WHOLE_SCRIPT_CONFUSABLE, or * zero if no errors are found. * @param pe In the event of an error in the input, receives the position * in the input text (line, offset) of the error. * @param status an in/out ICU UErrorCode. Among the possible errors is * U_PARSE_ERROR, which is used to report syntax errors * in the input. * @return A spoof checker that uses the rules from the input files. 
* @stable ICU 4.2 */ U_CAPI USpoofChecker * U_EXPORT2 uspoof_openFromSource(const char *confusables, int32_t confusablesLen, const char *confusablesWholeScript, int32_t confusablesWholeScriptLen, int32_t *errType, UParseError *pe, UErrorCode *status); /** * Close a Spoof Checker, freeing any memory that was being held by * its implementation. * @param sc The USpoofChecker to close. * @stable ICU 4.2 */ U_CAPI void U_EXPORT2 uspoof_close(USpoofChecker *sc); /** * Clone a Spoof Checker. The clone will be set to perform the same checks * as the original source. * * @param sc The source USpoofChecker * @param status The error code, set if this function encounters a problem. * @return The cloned Spoof Checker. * @stable ICU 4.2 */ U_CAPI USpoofChecker * U_EXPORT2 uspoof_clone(const USpoofChecker *sc, UErrorCode *status); /** * Specify the bitmask of checks that will be performed by {@link uspoof_check}. Calling this method * overwrites any checks that may have already been enabled. By default, all checks are enabled. * * To enable specific checks and disable all others, * OR together only the bit constants for the desired checks. * For example, to fail strings containing characters outside of * the set specified by {@link uspoof_setAllowedChars} and * also strings that contain digits from mixed numbering systems: * *
 * {@code
 * uspoof_setChecks(sc, USPOOF_CHAR_LIMIT | USPOOF_MIXED_NUMBERS, &status);
 * }
 * 
* * To disable specific checks and enable all others, * start with ALL_CHECKS and "AND away" the not-desired checks. * For example, if you are not planning to use the {@link uspoof_areConfusable} functionality, * it is good practice to disable the CONFUSABLE check: * *
 * {@code
 * uspoof_setChecks(sc, USPOOF_ALL_CHECKS & ~USPOOF_CONFUSABLE, &status);
 * }
 * 
* * Note that methods such as {@link uspoof_setAllowedChars}, {@link uspoof_setAllowedLocales}, and * {@link uspoof_setRestrictionLevel} will enable certain checks when called. Those methods will OR the check they * enable onto the existing bitmask specified by this method. For more details, see the documentation of those * methods. * * @param sc The USpoofChecker * @param checks The set of checks that this spoof checker will perform. * The value is a bit set, obtained by OR-ing together * values from enum USpoofChecks. * @param status The error code, set if this function encounters a problem. * @stable ICU 4.2 * */ U_CAPI void U_EXPORT2 uspoof_setChecks(USpoofChecker *sc, int32_t checks, UErrorCode *status); /** * Get the set of checks that this Spoof Checker has been configured to perform. * * @param sc The USpoofChecker * @param status The error code, set if this function encounters a problem. * @return The set of checks that this spoof checker will perform. * The value is a bit set, obtained by OR-ing together * values from enum USpoofChecks. * @stable ICU 4.2 * */ U_CAPI int32_t U_EXPORT2 uspoof_getChecks(const USpoofChecker *sc, UErrorCode *status); /** * Set the loosest restriction level allowed for strings. The default if this is not called is * {@link USPOOF_HIGHLY_RESTRICTIVE}. Calling this method enables the {@link USPOOF_RESTRICTION_LEVEL} and * {@link USPOOF_MIXED_NUMBERS} checks, corresponding to Sections 5.1 and 5.2 of UTS 39. To customize which checks are * to be performed by {@link uspoof_check}, see {@link uspoof_setChecks}. * * @param sc The USpoofChecker * @param restrictionLevel The loosest restriction level allowed. * @see URestrictionLevel * @stable ICU 51 */ U_CAPI void U_EXPORT2 uspoof_setRestrictionLevel(USpoofChecker *sc, URestrictionLevel restrictionLevel); /** * Get the Restriction Level that will be tested if the checks include {@link USPOOF_RESTRICTION_LEVEL}. 
* * @param sc The USpoofChecker * @return The restriction level * @see URestrictionLevel * @stable ICU 51 */ U_CAPI URestrictionLevel U_EXPORT2 uspoof_getRestrictionLevel(const USpoofChecker *sc); /** * Limit characters that are acceptable in identifiers being checked to those * normally used with the languages associated with the specified locales. * Any previously specified list of locales is replaced by the new settings. * * A set of languages is determined from the locale(s), and * from those a set of acceptable Unicode scripts is determined. * Characters from this set of scripts, along with characters from * the "common" and "inherited" Unicode Script categories * will be permitted. * * Supplying an empty string removes all restrictions; * characters from any script will be allowed. * * The {@link USPOOF_CHAR_LIMIT} test is automatically enabled for this * USpoofChecker when calling this function with a non-empty list * of locales. * * The Unicode Set of characters that will be allowed is accessible * via the uspoof_getAllowedChars() function. uspoof_setAllowedLocales() * will replace any previously applied set of allowed characters. * * Adjustments, such as additions or deletions of certain classes of characters, * can be made to the result of uspoof_setAllowedLocales() by * fetching the resulting set with uspoof_getAllowedChars(), * manipulating it with the Unicode Set API, then resetting the * spoof detector's limits with uspoof_setAllowedChars(). * * @param sc The USpoofChecker * @param localesList A list of locales, from which the language * and associated script are extracted. The locales * are comma-separated if there is more than one. * White space may not appear within an individual locale, * but is ignored otherwise. * The locales are syntactically like those from the * HTTP Accept-Language header. * If the localesList is empty, no restrictions will be placed on * the allowed characters. 
* * @param status The error code, set if this function encounters a problem. 
* @stable ICU 4.2 */ U_CAPI void U_EXPORT2 uspoof_setAllowedLocales(USpoofChecker *sc, const char *localesList, UErrorCode *status); /** * Get a list of locales for the scripts that are acceptable in strings * to be checked. If no limitations on scripts have been specified, * an empty string will be returned. * * uspoof_setAllowedChars() will reset the list of allowed to be empty. * * The format of the returned list is the same as that supplied to * uspoof_setAllowedLocales(), but returned list may not be identical * to the originally specified string; the string may be reformatted, * and information other than languages from * the originally specified locales may be omitted. * * @param sc The USpoofChecker * @param status The error code, set if this function encounters a problem. * @return A string containing a list of locales corresponding * to the acceptable scripts, formatted like an * HTTP Accept Language value. * * @stable ICU 4.2 */ U_CAPI const char * U_EXPORT2 uspoof_getAllowedLocales(USpoofChecker *sc, UErrorCode *status); /** * Limit the acceptable characters to those specified by a Unicode Set. * Any previously specified character limit is * is replaced by the new settings. This includes limits on * characters that were set with the uspoof_setAllowedLocales() function. * * The USPOOF_CHAR_LIMIT test is automatically enabled for this * USpoofChecker by this function. * * @param sc The USpoofChecker * @param chars A Unicode Set containing the list of * characters that are permitted. Ownership of the set * remains with the caller. The incoming set is cloned by * this function, so there are no restrictions on modifying * or deleting the USet after calling this function. * @param status The error code, set if this function encounters a problem. * @stable ICU 4.2 */ U_CAPI void U_EXPORT2 uspoof_setAllowedChars(USpoofChecker *sc, const USet *chars, UErrorCode *status); /** * Get a USet for the characters permitted in an identifier. 
* This corresponds to the limits imposed by the Set Allowed Characters
 * functions. Limitations imposed by other checks will not be
 * reflected in the set returned by this function.
 *
 * The returned set will be frozen, meaning that it cannot be modified
 * by the caller.
 *
 * Ownership of the returned set remains with the Spoof Detector.  The
 * returned set will become invalid if the spoof detector is closed,
 * or if a new set of allowed characters is specified.
 *
 *
 * @param sc      The USpoofChecker
 * @param status  The error code, set if this function encounters a problem.
 * @return        A USet containing the characters that are permitted by
 *                the USPOOF_CHAR_LIMIT test.
 * @stable ICU 4.2
 */
U_CAPI const USet * U_EXPORT2
uspoof_getAllowedChars(const USpoofChecker *sc, UErrorCode *status);

/**
 * Check the specified string for possible security issues.
 * The text to be checked will typically be an identifier of some sort.
 * The set of checks to be performed is specified with uspoof_setChecks().
 *
 * \note
 *   Consider using the newer API, {@link uspoof_check2}, instead.
 *   The newer API exposes additional information from the check procedure
 *   and is otherwise identical to this method.
 *
 * @param sc      The USpoofChecker
 * @param id      The identifier to be checked for possible security issues,
 *                in UTF-16 format.
 * @param length  the length of the string to be checked, expressed in
 *                16 bit UTF-16 code units, or -1 if the string is
 *                zero terminated.
 * @param position  Deprecated in ICU 51.  Always returns zero.
 *                Originally, an out parameter for the index of the first
 *                string position that failed a check.
 *                This parameter may be NULL.
 * @param status  The error code, set if an error occurred while attempting to
 *                perform the check.
 *                Spoofing or security issues detected with the input string are
 *                not reported here, but through the function's return value.
 * @return        An integer value with bits set for any potential security
 *                or spoofing issues detected.
*                The bits are defined by
 *                enum USpoofChecks.  (returned_value & USPOOF_ALL_CHECKS)
 *                will be zero if the input string passes all of the
 *                enabled checks.
 * @see uspoof_check2
 * @stable ICU 4.2
 */
U_CAPI int32_t U_EXPORT2
uspoof_check(const USpoofChecker *sc, const UChar *id, int32_t length, int32_t *position, UErrorCode *status);

/**
 * Check the specified string for possible security issues.
 * The text to be checked will typically be an identifier of some sort.
 * The set of checks to be performed is specified with uspoof_setChecks().
 *
 * \note
 *   Consider using the newer API, {@link uspoof_check2UTF8}, instead.
 *   The newer API exposes additional information from the check procedure
 *   and is otherwise identical to this method.
 *
 * @param sc      The USpoofChecker
 * @param id      An identifier to be checked for possible security issues, in UTF8 format.
 * @param length  the length of the string to be checked, or -1 if the string is
 *                zero terminated.
 * @param position  Deprecated in ICU 51.  Always returns zero.
 *                Originally, an out parameter for the index of the first
 *                string position that failed a check.
 *                This parameter may be NULL.
 * @param status  The error code, set if an error occurred while attempting to
 *                perform the check.
 *                Spoofing or security issues detected with the input string are
 *                not reported here, but through the function's return value.
 *                If the input contains invalid UTF-8 sequences,
 *                a status of U_INVALID_CHAR_FOUND will be returned.
 * @return        An integer value with bits set for any potential security
 *                or spoofing issues detected.  The bits are defined by
 *                enum USpoofChecks.  (returned_value & USPOOF_ALL_CHECKS)
 *                will be zero if the input string passes all of the
 *                enabled checks.
* @see uspoof_check2UTF8
 * @stable ICU 4.2
 */
U_CAPI int32_t U_EXPORT2
uspoof_checkUTF8(const USpoofChecker *sc, const char *id, int32_t length, int32_t *position, UErrorCode *status);

#if (NTDDI_VERSION >= NTDDI_WIN10_RS5)
/**
 * Check the specified string for possible security issues.
 * The text to be checked will typically be an identifier of some sort.
 * The set of checks to be performed is specified with uspoof_setChecks().
 *
 * @param sc      The USpoofChecker
 * @param id      The identifier to be checked for possible security issues,
 *                in UTF-16 format.
 * @param length  the length of the string to be checked, or -1 if the string is
 *                zero terminated.
 * @param checkResult  An instance of USpoofCheckResult to be filled with
 *                details about the identifier.  Can be NULL.
 * @param status  The error code, set if an error occurred while attempting to
 *                perform the check.
 *                Spoofing or security issues detected with the input string are
 *                not reported here, but through the function's return value.
 * @return        An integer value with bits set for any potential security
 *                or spoofing issues detected.  The bits are defined by
 *                enum USpoofChecks.  (returned_value & USPOOF_ALL_CHECKS)
 *                will be zero if the input string passes all of the
 *                enabled checks.  Any information in this bitmask will be
 *                consistent with the information saved in the optional
 *                checkResult parameter.
 * @see uspoof_openCheckResult
 * @see uspoof_check2UTF8
 * @see uspoof_check2UnicodeString
 * @stable ICU 58
 */
U_CAPI int32_t U_EXPORT2
uspoof_check2(const USpoofChecker *sc, const UChar* id, int32_t length, USpoofCheckResult* checkResult, UErrorCode *status);

/**
 * Check the specified string for possible security issues.
 * The text to be checked will typically be an identifier of some sort.
 * The set of checks to be performed is specified with uspoof_setChecks().
 *
 * This version of {@link uspoof_check} accepts a USpoofCheckResult, which
 * returns additional information about the identifier.
* For more information, see {@link uspoof_openCheckResult}.
 *
 * @param sc      The USpoofChecker
 * @param id      An identifier to be checked for possible security issues, in UTF8 format.
 * @param length  the length of the string to be checked, or -1 if the string is
 *                zero terminated.
 * @param checkResult  An instance of USpoofCheckResult to be filled with
 *                details about the identifier.  Can be NULL.
 * @param status  The error code, set if an error occurred while attempting to
 *                perform the check.
 *                Spoofing or security issues detected with the input string are
 *                not reported here, but through the function's return value.
 * @return        An integer value with bits set for any potential security
 *                or spoofing issues detected.  The bits are defined by
 *                enum USpoofChecks.  (returned_value & USPOOF_ALL_CHECKS)
 *                will be zero if the input string passes all of the
 *                enabled checks.  Any information in this bitmask will be
 *                consistent with the information saved in the optional
 *                checkResult parameter.
 * @see uspoof_openCheckResult
 * @see uspoof_check2
 * @see uspoof_check2UnicodeString
 * @stable ICU 58
 */
U_CAPI int32_t U_EXPORT2
uspoof_check2UTF8(const USpoofChecker *sc, const char *id, int32_t length, USpoofCheckResult* checkResult, UErrorCode *status);

/**
 * Create a USpoofCheckResult, used by the {@link uspoof_check2} class of functions to return
 * information about the identifier.  Information includes:
 * <ul>
 *   <li>A bitmask of the checks that failed</li>
 *   <li>The identifier's restriction level (UTS 39 section 5.2)</li>
 *   <li>The set of numerics in the string (UTS 39 section 5.3)</li>
 * </ul>
 * The data held in a USpoofCheckResult is cleared whenever it is passed into a new call
 * of {@link uspoof_check2}.
 *
 * @param status  The error code, set if this function encounters a problem.
 * @return        the newly created USpoofCheckResult
 * @see uspoof_check2
 * @see uspoof_check2UTF8
 * @see uspoof_check2UnicodeString
 * @stable ICU 58
 */
U_CAPI USpoofCheckResult* U_EXPORT2
uspoof_openCheckResult(UErrorCode *status);

/**
 * Close a USpoofCheckResult, freeing any memory that was being held by
 * its implementation.
 *
 * @param checkResult  The instance of USpoofCheckResult to close
 * @stable ICU 58
 */
U_CAPI void U_EXPORT2
uspoof_closeCheckResult(USpoofCheckResult *checkResult);

/**
 * Indicates which of the spoof check(s) have failed. The value is a bitwise OR of the constants for the tests
 * in question: USPOOF_RESTRICTION_LEVEL, USPOOF_CHAR_LIMIT, and so on.
 *
 * @param checkResult  The instance of USpoofCheckResult created by {@link uspoof_openCheckResult}
 * @param status       The error code, set if an error occurred.
 * @return       An integer value with bits set for any potential security
 *               or spoofing issues detected.  The bits are defined by
 *               enum USpoofChecks.  (returned_value & USPOOF_ALL_CHECKS)
 *               will be zero if the input string passes all of the
 *               enabled checks.
 * @see uspoof_setChecks
 * @stable ICU 58
 */
U_CAPI int32_t U_EXPORT2
uspoof_getCheckResultChecks(const USpoofCheckResult *checkResult, UErrorCode *status);

/**
 * Gets the restriction level that the text meets, if the USPOOF_RESTRICTION_LEVEL check
 * was enabled; otherwise, undefined.
 *
 * @param checkResult  The instance of USpoofCheckResult created by {@link uspoof_openCheckResult}
 * @param status       The error code, set if an error occurred.
* @return              The restriction level contained in the USpoofCheckResult
 * @see uspoof_setRestrictionLevel
 * @stable ICU 58
 */
U_CAPI URestrictionLevel U_EXPORT2
uspoof_getCheckResultRestrictionLevel(const USpoofCheckResult *checkResult, UErrorCode *status);

/**
 * Gets the set of numerics found in the string, if the USPOOF_MIXED_NUMBERS check was enabled;
 * otherwise, undefined.  The set will contain the zero digit from each decimal number system found
 * in the input string.  Ownership of the returned USet remains with the USpoofCheckResult.
 * The USet will be freed when {@link uspoof_closeCheckResult} is called.
 *
 * @param checkResult  The instance of USpoofCheckResult created by {@link uspoof_openCheckResult}
 * @param status       The error code, set if an error occurred.
 * @return             The set of numerics contained in the USpoofCheckResult
 * @stable ICU 58
 */
U_CAPI const USet* U_EXPORT2
uspoof_getCheckResultNumerics(const USpoofCheckResult *checkResult, UErrorCode *status);
#endif // (NTDDI_VERSION >= NTDDI_WIN10_RS5)

/**
 * Check whether two specified strings are visually confusable.
 *
 * If the strings are confusable, the return value will be nonzero, as long as
 * {@link USPOOF_CONFUSABLE} was enabled in uspoof_setChecks().
 *
 * The bits in the return value correspond to flags for each of the classes of
 * confusables applicable to the two input strings.  According to UTS 39
 * section 4, the possible flags are:
 *
 * <ul>
 *   <li>{@link USPOOF_SINGLE_SCRIPT_CONFUSABLE}</li>
 *   <li>{@link USPOOF_MIXED_SCRIPT_CONFUSABLE}</li>
 *   <li>{@link USPOOF_WHOLE_SCRIPT_CONFUSABLE}</li>
 * </ul>
 *
 * If one or more of the above flags were not listed in uspoof_setChecks(), this
 * function will never report that class of confusable.  The check
 * {@link USPOOF_CONFUSABLE} enables all three flags.
 *
 *
 * @param sc      The USpoofChecker
 * @param id1     The first of the two identifiers to be compared for
 *                confusability.  The strings are in UTF-16 format.
 * @param length1 the length of the first identifier, expressed in
 *                16 bit UTF-16 code units, or -1 if the string is
 *                nul terminated.
 * @param id2     The second of the two identifiers to be compared for
 *                confusability.  The identifiers are in UTF-16 format.
 * @param length2 The length of the second identifier, expressed in
 *                16 bit UTF-16 code units, or -1 if the string is
 *                nul terminated.
 * @param status  The error code, set if an error occurred while attempting to
 *                perform the check.
 *                Confusability of the identifiers is not reported here,
 *                but through this function's return value.
 * @return        An integer value with bit(s) set corresponding to
 *                the type of confusability found, as defined by
 *                enum USpoofChecks.  Zero is returned if the identifiers
 *                are not confusable.
 *
 * @stable ICU 4.2
 */
U_CAPI int32_t U_EXPORT2
uspoof_areConfusable(const USpoofChecker *sc, const UChar *id1, int32_t length1, const UChar *id2, int32_t length2, UErrorCode *status);

/**
 * A version of {@link uspoof_areConfusable} accepting strings in UTF-8 format.
 *
 * @param sc      The USpoofChecker
 * @param id1     The first of the two identifiers to be compared for
 *                confusability.  The strings are in UTF-8 format.
 * @param length1 the length of the first identifier, in bytes, or -1
 *                if the string is nul terminated.
 * @param id2     The second of the two identifiers to be compared for
 *                confusability.  The strings are in UTF-8 format.
 * @param length2 The length of the second string in bytes, or -1
 *                if the string is nul terminated.
 * @param status  The error code, set if an error occurred while attempting to
 *                perform the check.
*                Confusability of the strings is not reported here,
 *                but through this function's return value.
 * @return        An integer value with bit(s) set corresponding to
 *                the type of confusability found, as defined by
 *                enum USpoofChecks.  Zero is returned if the strings
 *                are not confusable.
 *
 * @stable ICU 4.2
 *
 * @see uspoof_areConfusable
 */
U_CAPI int32_t U_EXPORT2
uspoof_areConfusableUTF8(const USpoofChecker *sc, const char *id1, int32_t length1, const char *id2, int32_t length2, UErrorCode *status);

/**
 * Get the "skeleton" for an identifier.
 * Skeletons are a transformation of the input identifier;
 * Two identifiers are confusable if their skeletons are identical.
 * See Unicode UAX #39 for additional information.
 *
 * Using skeletons directly makes it possible to quickly check
 * whether an identifier is confusable with any of some large
 * set of existing identifiers, by creating an efficiently
 * searchable collection of the skeletons.
 *
 * @param sc      The USpoofChecker
 * @param type    Deprecated in ICU 58.  You may pass any number.
 *                Originally, controlled which of the Unicode confusable data
 *                tables to use.
 * @param id      The input identifier whose skeleton will be computed.
 * @param length  The length of the input identifier, expressed in 16 bit
 *                UTF-16 code units, or -1 if the string is zero terminated.
 * @param dest    The output buffer, to receive the skeleton string.
 * @param destCapacity  The length of the output buffer, in 16 bit units.
 *                The destCapacity may be zero, in which case the function will
 *                return the actual length of the skeleton.
 * @param status  The error code, set if an error occurred while attempting to
 *                perform the check.
 * @return        The length of the skeleton string.
*                The returned length
 *                is always that of the complete skeleton, even when the
 *                supplied buffer is too small (or of zero length)
 *
 * @stable ICU 4.2
 * @see uspoof_areConfusable
 */
U_CAPI int32_t U_EXPORT2
uspoof_getSkeleton(const USpoofChecker *sc, uint32_t type, const UChar *id, int32_t length, UChar *dest, int32_t destCapacity, UErrorCode *status);

/**
 * Get the "skeleton" for an identifier.
 * Skeletons are a transformation of the input identifier;
 * Two identifiers are confusable if their skeletons are identical.
 * See Unicode UAX #39 for additional information.
 *
 * Using skeletons directly makes it possible to quickly check
 * whether an identifier is confusable with any of some large
 * set of existing identifiers, by creating an efficiently
 * searchable collection of the skeletons.
 *
 * @param sc      The USpoofChecker
 * @param type    Deprecated in ICU 58.  You may pass any number.
 *                Originally, controlled which of the Unicode confusable data
 *                tables to use.
 * @param id      The UTF-8 format identifier whose skeleton will be computed.
 * @param length  The length of the input string, in bytes,
 *                or -1 if the string is zero terminated.
 * @param dest    The output buffer, to receive the skeleton string.
 * @param destCapacity  The length of the output buffer, in bytes.
 *                The destCapacity may be zero, in which case the function will
 *                return the actual length of the skeleton.
 * @param status  The error code, set if an error occurred while attempting to
 *                perform the check.  Possible Errors include U_INVALID_CHAR_FOUND
 *                for invalid UTF-8 sequences, and
 *                U_BUFFER_OVERFLOW_ERROR if the destination buffer is too small
 *                to hold the complete skeleton.
 * @return        The length of the skeleton string, in bytes.
*                The returned length
 *                is always that of the complete skeleton, even when the
 *                supplied buffer is too small (or of zero length)
 *
 * @stable ICU 4.2
 */
U_CAPI int32_t U_EXPORT2
uspoof_getSkeletonUTF8(const USpoofChecker *sc, uint32_t type, const char *id, int32_t length, char *dest, int32_t destCapacity, UErrorCode *status);

/**
 * Get the set of Candidate Characters for Inclusion in Identifiers, as defined
 * in http://unicode.org/Public/security/latest/xidmodifications.txt
 * and documented in http://www.unicode.org/reports/tr39/, Unicode Security Mechanisms.
 *
 * The returned set is frozen. Ownership of the set remains with the ICU library; it must not
 * be deleted by the caller.
 *
 * @param status The error code, set if a problem occurs while creating the set.
 *
 * @stable ICU 51
 */
U_CAPI const USet * U_EXPORT2
uspoof_getInclusionSet(UErrorCode *status);

/**
 * Get the set of characters from Recommended Scripts for Inclusion in Identifiers, as defined
 * in http://unicode.org/Public/security/latest/xidmodifications.txt
 * and documented in http://www.unicode.org/reports/tr39/, Unicode Security Mechanisms.
 *
 * The returned set is frozen. Ownership of the set remains with the ICU library; it must not
 * be deleted by the caller.
 *
 * @param status The error code, set if a problem occurs while creating the set.
 *
 * @stable ICU 51
 */
U_CAPI const USet * U_EXPORT2
uspoof_getRecommendedSet(UErrorCode *status);

/**
 * Serialize the data for a spoof detector into a chunk of memory.
 * The flattened spoof detection tables can later be used to efficiently
 * instantiate a new Spoof Detector.
 *
 * The serialized spoof checker includes only the data compiled from the
 * Unicode data tables by uspoof_openFromSource(); it does not
 * include any other state or configuration that may have been set.
 *
 * @param sc   the Spoof Detector whose data is to be serialized.
* @param data a pointer to 32-bit-aligned memory to be filled with the data,
 *             can be NULL if capacity==0
 * @param capacity the number of bytes available at data,
 *                 or 0 for preflighting
 * @param status an in/out ICU UErrorCode; possible errors include:
 * - U_BUFFER_OVERFLOW_ERROR if the data storage block is too small for serialization
 * - U_ILLEGAL_ARGUMENT_ERROR  the data or capacity parameters are bad
 * @return the number of bytes written or needed for the spoof data
 *
 * @see utrie2_openFromSerialized()
 * @stable ICU 4.2
 */
U_CAPI int32_t U_EXPORT2
uspoof_serialize(USpoofChecker *sc, void *data, int32_t capacity, UErrorCode *status);

U_CDECL_END

#endif /* UCONFIG_NO_NORMALIZATION */

#endif /* USPOOF_H */

// utmscale.h

// Copyright (C) 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2004 - 2008, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/

#ifndef UTMSCALE_H
#define UTMSCALE_H

#if !UCONFIG_NO_FORMATTING

/**
 * \file
 * \brief C API: Universal Time Scale
 *
 * There are quite a few different conventions for binary datetime, depending on different
 * platforms and protocols. Some of these have severe drawbacks. For example, people using
 * Unix time (seconds since Jan 1, 1970) think that they are safe until near the year 2038.
 * But cases can and do arise where arithmetic manipulations cause serious problems. Consider
 * the computation of the average of two datetimes, for example: if one calculates them with
 * averageTime = (time1 + time2)/2, there will be overflow even with dates
 * around the present. Moreover, even if these problems don't occur, there is the issue of
 * conversion back and forth between different systems.
 *
 * <p>

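* Binary datetimes differ in a number of ways: the datatype, the unit,
 * and the epoch (origin). We'll refer to these as time scales. For example:
 *
 * <table border="1" cellspacing="0" cellpadding="4">
 *  <caption>Table 1: Binary Time Scales</caption>
 *  <tr><th align="left">Source</th><th align="left">Datatype</th><th align="left">Unit</th><th align="left">Epoch</th></tr>
 *  <tr><td>UDTS_JAVA_TIME</td><td>int64_t</td><td>milliseconds</td><td>Jan 1, 1970</td></tr>
 *  <tr><td>UDTS_UNIX_TIME</td><td>int32_t or int64_t</td><td>seconds</td><td>Jan 1, 1970</td></tr>
 *  <tr><td>UDTS_ICU4C_TIME</td><td>double</td><td>milliseconds</td><td>Jan 1, 1970</td></tr>
 *  <tr><td>UDTS_WINDOWS_FILE_TIME</td><td>int64_t</td><td>ticks (100 nanoseconds)</td><td>Jan 1, 1601</td></tr>
 *  <tr><td>UDTS_DOTNET_DATE_TIME</td><td>int64_t</td><td>ticks (100 nanoseconds)</td><td>Jan 1, 0001</td></tr>
 *  <tr><td>UDTS_MAC_OLD_TIME</td><td>int32_t or int64_t</td><td>seconds</td><td>Jan 1, 1904</td></tr>
 *  <tr><td>UDTS_MAC_TIME</td><td>double</td><td>seconds</td><td>Jan 1, 2001</td></tr>
 *  <tr><td>UDTS_EXCEL_TIME</td><td>?</td><td>days</td><td>Dec 31, 1899</td></tr>
 *  <tr><td>UDTS_DB2_TIME</td><td>?</td><td>days</td><td>Dec 31, 1899</td></tr>
 *  <tr><td>UDTS_UNIX_MICROSECONDS_TIME</td><td>int64_t</td><td>microseconds</td><td>Jan 1, 1970</td></tr>
 * </table>
 *
 * <p>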

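* All of the epochs start at 00:00 am (the earliest possible time on the day in question),
 * and are assumed to be UTC.
 *
 * <p>
 * The ranges for different datatypes are given in the following table (all values in years).
 * The range of years includes the entire range expressible with positive and negative
 * values of the datatype. The range of years for double is the range that would be allowed
 * without losing precision to the corresponding unit.
 *
 * <table border="1" cellspacing="0" cellpadding="4">
 *  <tr><th align="left">Units</th><th align="left">int64_t</th><th align="left">double</th><th align="left">int32_t</th></tr>
 *  <tr><td>1 sec</td><td align="right">5.84542x10<sup>11</sup></td><td align="right">285,420,920.94</td><td align="right">136.10</td></tr>
 *  <tr><td>1 millisecond</td><td align="right">584,542,046.09</td><td align="right">285,420.92</td><td align="right">0.14</td></tr>
 *  <tr><td>1 microsecond</td><td align="right">584,542.05</td><td align="right">285.42</td><td align="right">0.00</td></tr>
 *  <tr><td>100 nanoseconds (tick)</td><td align="right">58,454.20</td><td align="right">28.54</td><td align="right">0.00</td></tr>
 *  <tr><td>1 nanosecond</td><td align="right">584.5420461</td><td align="right">0.2854</td><td align="right">0.00</td></tr>
 * </table>
 *
 * <p>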

* These functions implement a universal time scale which can be used as a 'pivot',
 * and provide conversion functions to and from all other major time scales.
 * This allows datetimes to be converted to the pivot time, safely manipulated,
 * and converted back to any other datetime time scale.
 *
 * <p>
 * So what to use for this pivot? Java time has plenty of range, but cannot represent
 * .NET System.DateTime values without severe loss of precision. ICU4C time addresses this by using a
 * double that is otherwise equivalent to the Java time. However, there are disadvantages
 * with doubles. They provide for much more graceful degradation in arithmetic operations.
 * But they only have 53 bits of accuracy, which means that they will lose precision when
 * converting back and forth to ticks. What would really be nice would be a
 * long double (80 bits -- 64 bit mantissa), but that is not supported on most systems.
 *
 * <p>
 * The Unix extended time uses a structure with two components: time in seconds and a
 * fractional field (microseconds). However, this is clumsy, slow, and
 * prone to error (you always have to keep track of overflow and underflow in the
 * fractional field). BigDecimal would allow for arbitrary precision and arbitrary range,
 * but we do not want to use this as the normal type, because it is slow and does not
 * have a fixed size.
 *
 * <p>

* Because of these issues, we ended up concluding that the .NET framework's
 * System.DateTime would be the best pivot. However, we use the full range
 * allowed by the datatype, allowing for datetimes back to 29,000 BC and up to 29,000 AD.
 * This time scale is very fine grained, does not lose precision, and covers a range that
 * will meet almost all requirements. It will not handle the range that Java times do,
 * but frankly, being able to handle dates before 29,000 BC or after 29,000 AD is of very limited interest.
 *
 */

/**
 * UDateTimeScale values are used to specify the time scale used for
 * conversion into or out of the universal time scale.
 *
 * @stable ICU 3.2
 */
typedef enum UDateTimeScale {
    /**
     * Used in the JDK. Data is a Java long (int64_t). Value
     * is milliseconds since January 1, 1970.
     *
     * @stable ICU 3.2
     */
    UDTS_JAVA_TIME = 0,

    /**
     * Used on Unix systems. Data is int32_t or int64_t. Value
     * is seconds since January 1, 1970.
     *
     * @stable ICU 3.2
     */
    UDTS_UNIX_TIME,

    /**
     * Used in ICU4C. Data is a double. Value
     * is milliseconds since January 1, 1970.
     *
     * @stable ICU 3.2
     */
    UDTS_ICU4C_TIME,

    /**
     * Used in Windows for file times. Data is an int64_t. Value
     * is ticks (1 tick == 100 nanoseconds) since January 1, 1601.
     *
     * @stable ICU 3.2
     */
    UDTS_WINDOWS_FILE_TIME,

    /**
     * Used in the .NET framework's System.DateTime structure. Data is an int64_t. Value
     * is ticks (1 tick == 100 nanoseconds) since January 1, 0001.
     *
     * @stable ICU 3.2
     */
    UDTS_DOTNET_DATE_TIME,

    /**
     * Used in older Macintosh systems. Data is int32_t or int64_t. Value
     * is seconds since January 1, 1904.
     *
     * @stable ICU 3.2
     */
    UDTS_MAC_OLD_TIME,

    /**
     * Used in newer Macintosh systems. Data is a double. Value
     * is seconds since January 1, 2001.
     *
     * @stable ICU 3.2
     */
    UDTS_MAC_TIME,

    /**
     * Used in Excel. Data is an ?unknown?. Value
     * is days since December 31, 1899.
     *
     * @stable ICU 3.2
     */
    UDTS_EXCEL_TIME,

    /**
     * Used in DB2. Data is an ?unknown?. Value
     * is days since December 31, 1899.
     *
     * @stable ICU 3.2
     */
    UDTS_DB2_TIME,

    /**
     * Data is a long. Value is microseconds since January 1, 1970.
     * Similar to Unix time (linear value from 1970) and struct timeval
     * (microseconds resolution).
     *
     * @stable ICU 3.8
     */
    UDTS_UNIX_MICROSECONDS_TIME,

} UDateTimeScale;

/**
 * UTimeScaleValue values are used to specify the time scale values
 * to utmscale_getTimeScaleValue.
 *
 * @see utmscale_getTimeScaleValue
 *
 * @stable ICU 3.2
 */
typedef enum UTimeScaleValue {
    /**
     * The constant used to select the units value
     * for a time scale.
     *
     * @see utmscale_getTimeScaleValue
     *
     * @stable ICU 3.2
     */
    UTSV_UNITS_VALUE = 0,

    /**
     * The constant used to select the epoch offset value
     * for a time scale.
     *
     * @see utmscale_getTimeScaleValue
     *
     * @stable ICU 3.2
     */
    UTSV_EPOCH_OFFSET_VALUE=1,

    /**
     * The constant used to select the minimum from value
     * for a time scale.
     *
     * @see utmscale_getTimeScaleValue
     *
     * @stable ICU 3.2
     */
    UTSV_FROM_MIN_VALUE=2,

    /**
     * The constant used to select the maximum from value
     * for a time scale.
     *
     * @see utmscale_getTimeScaleValue
     *
     * @stable ICU 3.2
     */
    UTSV_FROM_MAX_VALUE=3,

    /**
     * The constant used to select the minimum to value
     * for a time scale.
     *
     * @see utmscale_getTimeScaleValue
     *
     * @stable ICU 3.2
     */
    UTSV_TO_MIN_VALUE=4,

    /**
     * The constant used to select the maximum to value
     * for a time scale.
     *
     * @see utmscale_getTimeScaleValue
     *
     * @stable ICU 3.2
     */
    UTSV_TO_MAX_VALUE=5,

} UTimeScaleValue;

/**
 * Get a value associated with a particular time scale.
 *
 * @param timeScale The time scale
 * @param value     A constant representing the value to get
 * @param status    The status code. Set to U_ILLEGAL_ARGUMENT_ERROR if arguments are invalid.
 * @return - the value.
 *
 * @stable ICU 3.2
 */
U_CAPI int64_t U_EXPORT2
utmscale_getTimeScaleValue(UDateTimeScale timeScale, UTimeScaleValue value, UErrorCode *status);

/* Conversion to 'universal time scale' */

/**
 * Convert an int64_t datetime from the given time scale to the universal time scale.
*
 * @param otherTime The int64_t datetime
 * @param timeScale The time scale to convert from
 * @param status    The status code. Set to U_ILLEGAL_ARGUMENT_ERROR if the conversion is out of range.
 *
 * @return The datetime converted to the universal time scale
 *
 * @stable ICU 3.2
 */
U_CAPI int64_t U_EXPORT2
utmscale_fromInt64(int64_t otherTime, UDateTimeScale timeScale, UErrorCode *status);

/* Conversion from 'universal time scale' */

/**
 * Convert a datetime from the universal time scale to an int64_t in the given time scale.
 *
 * @param universalTime The datetime in the universal time scale
 * @param timeScale     The time scale to convert to
 * @param status        The status code. Set to U_ILLEGAL_ARGUMENT_ERROR if the conversion is out of range.
 *
 * @return The datetime converted to the given time scale
 *
 * @stable ICU 3.2
 */
U_CAPI int64_t U_EXPORT2
utmscale_toInt64(int64_t universalTime, UDateTimeScale timeScale, UErrorCode *status);

#endif /* #if !UCONFIG_NO_FORMATTING */

#endif

// utrans.h

// Copyright (C) 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*   Copyright (C) 1997-2011,2014-2015 International Business Machines
*   Corporation and others.  All Rights Reserved.
*******************************************************************************
*   Date        Name        Description
*   06/21/00    aliu        Creation.
*******************************************************************************
*/

#ifndef UTRANS_H
#define UTRANS_H

#if !UCONFIG_NO_TRANSLITERATION

/********************************************************************
 * General Notes
 ********************************************************************
 */
/**
 * \file
 * \brief C API: Transliterator
 *
 *

* <h2> Transliteration </h2>

* The data structures and functions described in this header provide
 * transliteration services.  Transliteration services are implemented
 * as C++ classes.  The comments and documentation in this header
 * assume the reader is familiar with the C++ headers translit.h and
 * associated documentation.
 *
 * A significant but incomplete subset of the C++ transliteration
 * services are available to C code through this header.  In order to
 * access more complex transliteration services, refer to the C++
 * headers and documentation.
 *
 * There are two sets of functions for working with transliterator IDs:
 *
 * An old, deprecated set uses char * IDs, which works for true and pure
 * identifiers that these APIs were designed for,
 * for example "Cyrillic-Latin".
 * It does not work when the ID contains filters ("[:Script=Cyrl:]")
 * or even a complete set of rules because then the ID string contains more
 * than just "invariant" characters (see utypes.h).
 *
 * A new set of functions replaces the old ones and uses UChar * IDs,
 * paralleling the UnicodeString IDs in the C++ API. (New in ICU 2.8.)
 */

/********************************************************************
 * Data Structures
 ********************************************************************/

/**
 * An opaque transliterator for use in C.  Open with utrans_openxxx()
 * and close with utrans_close() when done.  Equivalent to the C++ class
 * Transliterator and its subclasses.
 * @see Transliterator
 * @stable ICU 2.0
 */
typedef void* UTransliterator;

/**
 * Direction constant indicating the direction in a transliterator,
 * e.g., the forward or reverse rules of a RuleBasedTransliterator.
 * Specified when a transliterator is opened.  An "A-B" transliterator
 * transliterates A to B when operating in the forward direction, and
 * B to A when operating in the reverse direction.
 * @stable ICU 2.0
 */
typedef enum UTransDirection {

    /**
     * UTRANS_FORWARD means from &lt;source&gt; to &lt;target&gt; for a
     * transliterator with ID &lt;source&gt;-&lt;target&gt;.  For a transliterator
     * opened using a rule, it means forward direction rules, e.g.,
     * "A &gt; B".
     */
    UTRANS_FORWARD,

    /**
     * UTRANS_REVERSE means from &lt;target&gt; to &lt;source&gt; for a
     * transliterator with ID &lt;source&gt;-&lt;target&gt;.  For a transliterator
     * opened using a rule, it means reverse direction rules, e.g.,
     * "A &lt; B".
     */
    UTRANS_REVERSE

} UTransDirection;

/**
 * Position structure for utrans_transIncremental() incremental
 * transliteration.  This structure defines two substrings of the text
 * being transliterated.  The first region, [contextStart,
 * contextLimit), defines what characters the transliterator will read
 * as context.  The second region, [start, limit), defines what
 * characters will actually be transliterated.  The second region
 * should be a subset of the first.
 *
 *

* <p>After a transliteration operation, some of the indices in this
 * structure will be modified.  See the field descriptions for
 * details.
 *
 * <p>contextStart &lt;= start &lt;= limit &lt;= contextLimit
 *
 * <p>Note: All index values in this structure must be at code point
 * boundaries.  That is, none of them may occur between two code units
 * of a surrogate pair.  If any index does split a surrogate pair,
 * results are unspecified.
 *
 * @stable ICU 2.0
 */
typedef struct UTransPosition {

    /**
     * Beginning index, inclusive, of the context to be considered for
     * a transliteration operation.  The transliterator will ignore
     * anything before this index.  INPUT/OUTPUT parameter: This parameter
     * is updated by a transliteration operation to reflect the maximum
     * amount of antecontext needed by a transliterator.
     * @stable ICU 2.4
     */
    int32_t contextStart;

    /**
     * Ending index, exclusive, of the context to be considered for a
     * transliteration operation.  The transliterator will ignore
     * anything at or after this index.  INPUT/OUTPUT parameter: This
     * parameter is updated to reflect changes in the length of the
     * text, but points to the same logical position in the text.
     * @stable ICU 2.4
     */
    int32_t contextLimit;

    /**
     * Beginning index, inclusive, of the text to be transliterated.
     * INPUT/OUTPUT parameter: This parameter is advanced past
     * characters that have already been transliterated by a
     * transliteration operation.
     * @stable ICU 2.4
     */
    int32_t start;

    /**
     * Ending index, exclusive, of the text to be transliterated.
     * INPUT/OUTPUT parameter: This parameter is updated to reflect
     * changes in the length of the text, but points to the same
     * logical position in the text.
     * @stable ICU 2.4
     */
    int32_t limit;

} UTransPosition;

/********************************************************************
 * General API
 ********************************************************************/

/**
 * Open a custom transliterator, given a custom rules string
 * OR
 * a system transliterator, given its ID.
 * Any non-NULL result from this function should later be closed with
 * utrans_close().
* * @param id a valid transliterator ID * @param idLength the length of the ID string, or -1 if NUL-terminated * @param dir the desired direction * @param rules the transliterator rules. See the C++ header rbt.h for * rules syntax. If NULL then a system transliterator matching * the ID is returned. * @param rulesLength the length of the rules, or -1 if the rules * are NUL-terminated. * @param parseError a pointer to a UParseError struct to receive the details * of any parsing errors. This parameter may be NULL if no * parsing error details are desired. * @param pErrorCode a pointer to the UErrorCode * @return a transliterator pointer that may be passed to other * utrans_xxx() functions, or NULL if the open call fails. * @see utrans_close * @stable ICU 2.8 */ U_CAPI UTransliterator* U_EXPORT2 utrans_openU(const UChar *id, int32_t idLength, UTransDirection dir, const UChar *rules, int32_t rulesLength, UParseError *parseError, UErrorCode *pErrorCode); /** * Open an inverse of an existing transliterator. For this to work, * the inverse must be registered with the system. For example, if * the Transliterator "A-B" is opened, and then its inverse is opened, * the result is the Transliterator "B-A", if such a transliterator is * registered with the system. Otherwise the result is NULL and a * failing UErrorCode is set. Any non-NULL result from this function * should later be closed with utrans_close(). * * @param trans the transliterator to open the inverse of. * @param status a pointer to the UErrorCode * @return a pointer to a newly-opened transliterator that is the * inverse of trans, or NULL if the open call fails. * @see utrans_close * @stable ICU 2.0 */ U_CAPI UTransliterator* U_EXPORT2 utrans_openInverse(const UTransliterator* trans, UErrorCode* status); /** * Create a copy of a transliterator. Any non-NULL result from this * function should later be closed with utrans_close(). * * @param trans the transliterator to be copied.
* @param status a pointer to the UErrorCode * @return a transliterator pointer that may be passed to other * utrans_xxx() functions, or NULL if the clone call fails. * @see utrans_close * @stable ICU 2.0 */ U_CAPI UTransliterator* U_EXPORT2 utrans_clone(const UTransliterator* trans, UErrorCode* status); /** * Close a transliterator. Any non-NULL pointer returned by * utrans_openXxx() or utrans_clone() should eventually be closed. * @param trans the transliterator to be closed. * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 utrans_close(UTransliterator* trans); /** * Return the programmatic identifier for this transliterator. * If this identifier is passed to utrans_openU(), it will open * a transliterator equivalent to this one, if the ID has been * registered. * * @param trans the transliterator to return the ID of. * @param resultLength pointer to an output variable receiving the length * of the ID string; can be NULL * @return the NUL-terminated ID string. This pointer remains * valid until utrans_close() is called on this transliterator. * * @stable ICU 2.8 */ U_CAPI const UChar * U_EXPORT2 utrans_getUnicodeID(const UTransliterator *trans, int32_t *resultLength); /** * Register an open transliterator with the system. When * utrans_openU() is called with an ID string that is equal to that * returned by utrans_getUnicodeID(adoptedTrans,...), then * utrans_clone(adoptedTrans,...) is returned. * *

NOTE: After this call the system owns the adoptedTrans and will * close it. The user must not call utrans_close() on adoptedTrans. * * @param adoptedTrans a transliterator, typically one created from * a rules string with utrans_openU(), to be registered with the system. * @param status a pointer to the UErrorCode * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 utrans_register(UTransliterator* adoptedTrans, UErrorCode* status); /** * Unregister a transliterator from the system. After this call the * system will no longer recognize the given ID when passed to * utrans_openU(). If the ID is invalid then nothing is done. * * @param id an ID to unregister * @param idLength the length of id, or -1 if id is zero-terminated * @stable ICU 2.8 */ U_CAPI void U_EXPORT2 utrans_unregisterID(const UChar* id, int32_t idLength); /** * Set the filter used by a transliterator. A filter can be used to * make the transliterator pass certain characters through untouched. * The filter is expressed using a UnicodeSet pattern. If the * filterPattern is NULL or the empty string, then the transliterator * will be reset to use no filter. * * @param trans the transliterator * @param filterPattern a pattern string, in the form accepted by * UnicodeSet, specifying which characters to apply the * transliteration to. May be NULL or the empty string to indicate no * filter. * @param filterPatternLen the length of filterPattern, or -1 if * filterPattern is zero-terminated * @param status a pointer to the UErrorCode * @see UnicodeSet * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 utrans_setFilter(UTransliterator* trans, const UChar* filterPattern, int32_t filterPatternLen, UErrorCode* status); /** * Return the number of system transliterators. * It is recommended to use utrans_openIDs() instead. * * @return the number of system transliterators. * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 utrans_countAvailableIDs(void); /** * Return a UEnumeration for the available transliterators.
* * @param pErrorCode Pointer to the UErrorCode in/out parameter. * @return UEnumeration for the available transliterators. * Close with uenum_close(). * * @stable ICU 2.8 */ U_CAPI UEnumeration * U_EXPORT2 utrans_openIDs(UErrorCode *pErrorCode); /******************************************************************** * Transliteration API ********************************************************************/ /** * Transliterate a segment of a UReplaceable string. The string is * passed in as a UReplaceable pointer rep and a UReplaceableCallbacks * function pointer struct repFunc. Functions in the repFunc struct * will be called in order to modify the rep string. * * @param trans the transliterator * @param rep a pointer to the string. This will be passed to the * repFunc functions. * @param repFunc a set of function pointers that will be used to * modify the string pointed to by rep. Declared as a pointer to * const when NTDDI_VERSION >= NTDDI_WIN10_RS5; for earlier versions * (>= NTDDI_WIN10_RS3) it is declared as a pointer to non-const. * @param start the beginning index, inclusive; 0 <= start <= * limit. * @param limit pointer to the ending index, exclusive; start <= * limit <= repFunc->length(rep). Upon return, *limit will * contain the new limit index. The text previously occupying * [start, limit) has been transliterated, possibly to a * string of a different length, at [start, * new-limit), where new-limit * is the value stored in *limit (the function itself returns void). * @param status a pointer to the UErrorCode * @stable ICU 2.0 */ #if (NTDDI_VERSION >= NTDDI_WIN10_RS5) U_CAPI void U_EXPORT2 utrans_trans(const UTransliterator* trans, UReplaceable* rep, const UReplaceableCallbacks* repFunc, int32_t start, int32_t* limit, UErrorCode* status); #elif (NTDDI_VERSION >= NTDDI_WIN10_RS3) U_CAPI void U_EXPORT2 utrans_trans(const UTransliterator* trans, UReplaceable* rep, UReplaceableCallbacks* repFunc, int32_t start, int32_t* limit, UErrorCode* status); #endif // (NTDDI_VERSION >= NTDDI_WIN10_RS5) /** * Transliterate the portion of the UReplaceable text buffer that can * be transliterated unambiguously.
This method is typically called * after new text has been inserted, e.g. as a result of a keyboard * event. The transliterator will try to transliterate characters of * rep between index.cursor and * index.limit. Characters before * index.cursor will not be changed. * *

Upon return, values in index will be updated. * index.start will be advanced to the first * character that future calls to this method will read. * index.cursor and index.limit will * be adjusted to delimit the range of text that future calls to * this method may change. * *

Typical usage of this method begins with an initial call * with index.start and index.limit * set to indicate the portion of text to be * transliterated, and index.cursor == index.start. * Thereafter, index can be used without * modification in future calls, provided that all changes to * text are made via this method. * *

This method assumes that future calls may be made that will * insert new text into the buffer. As a result, it only performs * unambiguous transliterations. After the last call to this method, * there may be untransliterated text that is waiting for more input * to resolve an ambiguity. In order to perform these pending * transliterations, clients should call utrans_trans() with a start * of index.start and a limit of index.end after the last call to this * method has been made. * * NOTE(review): the description above uses the names index, * index.cursor and index.end, but the parameter declared below is pos and * UTransPosition declares only contextStart, contextLimit, start and limit. * Presumably index.cursor corresponds to pos->start and index.start to * pos->contextStart; confirm against the ICU documentation. * * @param trans the transliterator * @param rep a pointer to the string. This will be passed to the * repFunc functions. * @param repFunc a set of function pointers that will be used to * modify the string pointed to by rep. Declared as a pointer to * const when NTDDI_VERSION >= NTDDI_WIN10_RS5; for earlier versions * (>= NTDDI_WIN10_RS3) it is declared as a pointer to non-const. * @param pos a struct containing the start and limit indices of the * text to be read and the text to be transliterated * @param status a pointer to the UErrorCode * @stable ICU 2.0 */ #if (NTDDI_VERSION >= NTDDI_WIN10_RS5) U_CAPI void U_EXPORT2 utrans_transIncremental(const UTransliterator* trans, UReplaceable* rep, const UReplaceableCallbacks* repFunc, UTransPosition* pos, UErrorCode* status); #elif (NTDDI_VERSION >= NTDDI_WIN10_RS3) U_CAPI void U_EXPORT2 utrans_transIncremental(const UTransliterator* trans, UReplaceable* rep, UReplaceableCallbacks* repFunc, UTransPosition* pos, UErrorCode* status); #endif // (NTDDI_VERSION >= NTDDI_WIN10_RS5) /** * Transliterate a segment of a UChar* string. The string is passed * in as a UChar* buffer. The string is modified in place. If the * result is longer than textCapacity, it is truncated. The actual * length of the result is returned in *textLength, if textLength is * non-NULL. *textLength may be greater than textCapacity, but only * textCapacity UChars will be written to *text, including the zero * terminator. * * @param trans the transliterator * @param text a pointer to a buffer containing the text to be * transliterated on input and the result text on output.
* @param textLength a pointer to the length of the string in text. * If the length is -1 then the string is assumed to be * zero-terminated. Upon return, the new length is stored in * *textLength. If textLength is NULL then the string is assumed to * be zero-terminated. * @param textCapacity the capacity of the text buffer, in UChars * @param start the beginning index, inclusive; 0 <= start <= * limit. * @param limit pointer to the ending index, exclusive; start <= * limit <= the length of the string in text. Upon return, *limit will * contain the new limit index. The text previously occupying * [start, limit) has been transliterated, possibly to a * string of a different length, at [start, * new-limit), where new-limit * is the value stored in *limit (the function itself returns void). * @param status a pointer to the UErrorCode * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 utrans_transUChars(const UTransliterator* trans, UChar* text, int32_t* textLength, int32_t textCapacity, int32_t start, int32_t* limit, UErrorCode* status); /** * Transliterate the portion of the UChar* text buffer that can be * transliterated unambiguously. See utrans_transIncremental(). The * string is passed in as a UChar* buffer. The string is modified in * place. If the result is longer than textCapacity, it is truncated. * The actual length of the result is returned in *textLength, if * textLength is non-NULL. *textLength may be greater than * textCapacity, but only textCapacity UChars will be written to * *text, including the zero terminator. See utrans_transIncremental() * for usage details. * * @param trans the transliterator * @param text a pointer to a buffer containing the text to be * transliterated on input and the result text on output. * @param textLength a pointer to the length of the string in text. * If the length is -1 then the string is assumed to be * zero-terminated. Upon return, the new length is stored in * *textLength. If textLength is NULL then the string is assumed to * be zero-terminated.
* @param textCapacity the capacity of the text buffer, in UChars * @param pos a struct containing the start and limit indices of the * text to be read and the text to be transliterated * @param status a pointer to the UErrorCode * @see utrans_transIncremental * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 utrans_transIncrementalUChars(const UTransliterator* trans, UChar* text, int32_t* textLength, int32_t textCapacity, UTransPosition* pos, UErrorCode* status); /** * Create a rule string that can be passed to utrans_openU to recreate this * transliterator. * * @param trans The transliterator * @param escapeUnprintable if true then convert unprintable characters to their * hex escape representations, \\uxxxx or \\Uxxxxxxxx. * Unprintable characters are those other than * U+000A, U+0020..U+007E. * @param result A pointer to a buffer to receive the rules. * @param resultLength The maximum size of result. * @param status A pointer to the UErrorCode. In case of error status, the * contents of result are undefined. * @return int32_t The length of the rule string (may be greater than resultLength, * in which case an error code is set in status). * @see utrans_openU * @stable ICU 53 */ U_CAPI int32_t U_EXPORT2 utrans_toRules( const UTransliterator* trans, UBool escapeUnprintable, UChar* result, int32_t resultLength, UErrorCode* status); /** * Returns the set of all characters that may be modified in the input text by * this UTransliterator, optionally ignoring the transliterator's current filter. * @param trans The transliterator. * @param ignoreFilter If false, the returned set incorporates the * UTransliterator's current filter; if the filter is changed, * the return value of this function will change. If true, the * returned set ignores the effect of the UTransliterator's * current filter. * @param fillIn Pointer to a USet object to receive the modifiable characters * set. Previous contents of fillIn are lost. If fillIn is * NULL, then a new USet is created and returned.
In that case the caller * owns the result and must dispose of it by calling uset_close(). * @param status A pointer to the UErrorCode. * @return USet* Either fillIn, or if fillIn is NULL, a pointer to a * newly-allocated USet that the user must close. In case of * error, NULL is returned. * @see uset_close * @stable ICU 53 */ U_CAPI USet* U_EXPORT2 utrans_getSourceSet(const UTransliterator* trans, UBool ignoreFilter, USet* fillIn, UErrorCode* status); /* deprecated API ----------------------------------------------------------- */ #endif /* #if !UCONFIG_NO_TRANSLITERATION */ #endif // vtzone.h // No supported content #endif /* NTDDI_WIN10_RS3 */