office-gobmx/i18nutil/source/utility/unicode.cxx

/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
 * This file is part of the LibreOffice project.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * This file incorporates work covered by the following license notice:
 *
 *   Licensed to the Apache Software Foundation (ASF) under one or more
 *   contributor license agreements. See the NOTICE file distributed
 *   with this work for additional information regarding copyright
 *   ownership. The ASF licenses this file to you under the Apache
 *   License, Version 2.0 (the "License"); you may not use this file
 *   except in compliance with the License. You may obtain a copy of
 *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
 */

#include <com/sun/star/i18n/UnicodeType.hpp>
#include <com/sun/star/i18n/ScriptType.hpp>
#include <i18nlangtag/languagetag.hxx>
#include <i18nlangtag/languagetagicu.hxx>
#include <i18nutil/unicode.hxx>
#include <sal/log.hxx>
#include <unicode/numfmt.h>
#include <unicode/uchar.h>
#include "unicode_data.h"
#include <rtl/character.hxx>
#include <o3tl/string_view.hxx>
#include <memory>

// Workaround for glibc braindamage:
// glibc 2.4's langinfo.h does "#define CURRENCY_SYMBOL __CURRENCY_SYMBOL"
// which (obviously) breaks UnicodeType::CURRENCY_SYMBOL
#undef CURRENCY_SYMBOL

using namespace ::com::sun::star::i18n;

template<class L, typename T>
static T getScriptType( const sal_Unicode ch, const L* typeList, T unknownType ) {

    sal_Int16 i = 0;
    css::i18n::UnicodeScript type = typeList[0].to;
    while (type < UnicodeScript_kScriptCount && ch > UnicodeScriptType[static_cast<int>(type)][UnicodeScriptTypeTo]) {
        type = typeList[++i].to;
    }

    return (type < UnicodeScript_kScriptCount &&
            ch >= UnicodeScriptType[static_cast<int>(typeList[i].from)][int(UnicodeScriptTypeFrom)]) ?
            typeList[i].value : unknownType;
}

/** Public entry point for the script type lookup on a ScriptTypeList.

    Forwards to the file-local getScriptType() template.

    @param ch           code unit to classify
    @param typeList     terminated list of script ranges and their values
    @param unknownType  result when no list entry covers ch
*/
sal_Int16
unicode::getUnicodeScriptType( const sal_Unicode ch, const ScriptTypeList* typeList, sal_Int16 unknownType ) {
    const sal_Int16 nScriptType = getScriptType(ch, typeList, unknownType);
    return nScriptType;
}

/** Returns the 'from' column of the UnicodeScriptType table row for the given
    UnicodeScript, i.e. the first code unit of that block.
*/
sal_Unicode
unicode::getUnicodeScriptStart( UnicodeScript type) {
    const int nRow = static_cast<int>(type);
    return UnicodeScriptType[nRow][UnicodeScriptTypeFrom];
}

/** Returns the 'to' column of the UnicodeScriptType table row for the given
    UnicodeScript, i.e. the last code unit of that block.
*/
sal_Unicode
unicode::getUnicodeScriptEnd( UnicodeScript type) {
    const int nRow = static_cast<int>(type);
    return UnicodeScriptType[nRow][UnicodeScriptTypeTo];
}

/** Translates the ICU general category of a code point into the matching
    css::i18n::UnicodeType constant.

    The lookup is deliberately stateless. No result is kept in function-local
    statics, so every code point (including U+0000) is classified by ICU, and
    concurrent callers share no mutable state.

    @param ch  code point to classify
    @return the css::i18n::UnicodeType constant for the category reported by
            u_charType(); UnicodeType::UNASSIGNED if ICU reports a category
            that has no mapping here
*/
sal_Int16
unicode::getUnicodeType(const sal_uInt32 ch)
{
    switch (u_charType(ch))
    {
        case U_UNASSIGNED:
            return css::i18n::UnicodeType::UNASSIGNED;
        case U_UPPERCASE_LETTER:
            return css::i18n::UnicodeType::UPPERCASE_LETTER;
        case U_LOWERCASE_LETTER:
            return css::i18n::UnicodeType::LOWERCASE_LETTER;
        case U_TITLECASE_LETTER:
            return css::i18n::UnicodeType::TITLECASE_LETTER;
        case U_MODIFIER_LETTER:
            return css::i18n::UnicodeType::MODIFIER_LETTER;
        case U_OTHER_LETTER:
            return css::i18n::UnicodeType::OTHER_LETTER;
        case U_NON_SPACING_MARK:
            return css::i18n::UnicodeType::NON_SPACING_MARK;
        case U_ENCLOSING_MARK:
            return css::i18n::UnicodeType::ENCLOSING_MARK;
        case U_COMBINING_SPACING_MARK:
            return css::i18n::UnicodeType::COMBINING_SPACING_MARK;
        case U_DECIMAL_DIGIT_NUMBER:
            return css::i18n::UnicodeType::DECIMAL_DIGIT_NUMBER;
        case U_LETTER_NUMBER:
            return css::i18n::UnicodeType::LETTER_NUMBER;
        case U_OTHER_NUMBER:
            return css::i18n::UnicodeType::OTHER_NUMBER;
        case U_SPACE_SEPARATOR:
            return css::i18n::UnicodeType::SPACE_SEPARATOR;
        case U_LINE_SEPARATOR:
            return css::i18n::UnicodeType::LINE_SEPARATOR;
        case U_PARAGRAPH_SEPARATOR:
            return css::i18n::UnicodeType::PARAGRAPH_SEPARATOR;
        case U_CONTROL_CHAR:
            return css::i18n::UnicodeType::CONTROL;
        case U_FORMAT_CHAR:
            return css::i18n::UnicodeType::FORMAT;
        case U_PRIVATE_USE_CHAR:
            return css::i18n::UnicodeType::PRIVATE_USE;
        case U_SURROGATE:
            return css::i18n::UnicodeType::SURROGATE;
        case U_DASH_PUNCTUATION:
            return css::i18n::UnicodeType::DASH_PUNCTUATION;
        case U_INITIAL_PUNCTUATION:
            return css::i18n::UnicodeType::INITIAL_PUNCTUATION;
        case U_FINAL_PUNCTUATION:
            return css::i18n::UnicodeType::FINAL_PUNCTUATION;
        case U_CONNECTOR_PUNCTUATION:
            return css::i18n::UnicodeType::CONNECTOR_PUNCTUATION;
        case U_OTHER_PUNCTUATION:
            return css::i18n::UnicodeType::OTHER_PUNCTUATION;
        case U_MATH_SYMBOL:
            return css::i18n::UnicodeType::MATH_SYMBOL;
        case U_CURRENCY_SYMBOL:
            return css::i18n::UnicodeType::CURRENCY_SYMBOL;
        case U_MODIFIER_SYMBOL:
            return css::i18n::UnicodeType::MODIFIER_SYMBOL;
        case U_OTHER_SYMBOL:
            return css::i18n::UnicodeType::OTHER_SYMBOL;
        case U_START_PUNCTUATION:
            return css::i18n::UnicodeType::START_PUNCTUATION;
        case U_END_PUNCTUATION:
            return css::i18n::UnicodeType::END_PUNCTUATION;
        default:
            break;
    }

    // A category this table does not know about: report it as unassigned
    // instead of leaking the result of an earlier, unrelated query.
    return css::i18n::UnicodeType::UNASSIGNED;
}

/** Returns the direction value stored for a UTF-16 code unit in the generated
    direction tables.

    The high byte of ch selects an entry in UnicodeDirectionIndex. Entries
    below UnicodeDirectionNumberBlock index UnicodeDirectionBlockValue, which
    holds one value for the whole 256-unit page. Any other entry selects a
    256-entry page of UnicodeDirectionValue, indexed by the low byte of ch.

    The lookup is deliberately stateless. No result is kept in function-local
    statics, so every code unit (including U+0000) is resolved through the
    tables, and concurrent callers share no mutable state.

    @param ch  code unit to look up
    @return the table value for ch
*/
sal_uInt8
unicode::getUnicodeDirection( const sal_Unicode ch ) {
    const sal_Int16 address = UnicodeDirectionIndex[ch >> 8];

    // Whole page shares one direction value.
    if (address < UnicodeDirectionNumberBlock)
        return UnicodeDirectionBlockValue[address];

    // Page with per-code-unit values.
    return UnicodeDirectionValue[((address - UnicodeDirectionNumberBlock) << 8) + (ch & 0xff)];
}

/** Returns the result of ICU's u_charMirror() for the given code point.

    @param nChar  code point to mirror
    @return the value u_charMirror() yields for nChar
*/
sal_uInt32 unicode::GetMirroredChar(sal_uInt32 nChar) {
    return static_cast<sal_uInt32>(u_charMirror(nChar));
}

// Single-bit mask with bit number 'name' set. The argument is pasted without
// parentheses, so pass a simple expression.
#define bit(name)   (1U << name)

// The masks below are sets of css::i18n::UnicodeType categories, one bit per
// category. The multi-category masks expand to an unparenthesized chain of
// '|', so every use has to wrap them in parentheses (IsType does so).

// Uppercase letters only.
#define UPPERMASK   bit(UnicodeType::UPPERCASE_LETTER)

// Lowercase letters only.
#define LOWERMASK   bit(UnicodeType::LOWERCASE_LETTER)

// Titlecase letters only.
#define TITLEMASK   bit(UnicodeType::TITLECASE_LETTER)

// All five letter categories.
#define ALPHAMASK   UPPERMASK|LOWERMASK|TITLEMASK|\
            bit(UnicodeType::MODIFIER_LETTER)|\
            bit(UnicodeType::OTHER_LETTER)

// The three separator categories.
#define SPACEMASK   bit(UnicodeType::SPACE_SEPARATOR)|\
            bit(UnicodeType::LINE_SEPARATOR)|\
            bit(UnicodeType::PARAGRAPH_SEPARATOR)

// Control and format characters plus line and paragraph separators; the two
// separator categories are therefore members of both SPACEMASK and CONTROLMASK.
#define CONTROLMASK bit(UnicodeType::CONTROL)|\
            bit(UnicodeType::FORMAT)|\
            bit(UnicodeType::LINE_SEPARATOR)|\
            bit(UnicodeType::PARAGRAPH_SEPARATOR)

// Defines the predicate 'func', which returns true when the UnicodeType of the
// code point 'ch' (as reported by getUnicodeType) is one of the categories in
// 'mask'.
#define IsType(func, mask)  \
bool func( const sal_uInt32 ch) {\
    return (bit(getUnicodeType(ch)) & (mask)) != 0;\
}

// Definitions of unicode::isControl, unicode::isAlpha and unicode::isSpace.
IsType(unicode::isControl, CONTROLMASK)
IsType(unicode::isAlpha, ALPHAMASK)
IsType(unicode::isSpace, SPACEMASK)

// Bit set over the code points U+0000..U+001F: bit N is set when code point N
// is one of 0x09-0x0D or 0x1C-0x1F. Expands to an unparenthesized chain of '|',
// so wrap it in parentheses when using it.
#define CONTROLSPACE    bit(0x09)|bit(0x0a)|bit(0x0b)|bit(0x0c)|bit(0x0d)|\
            bit(0x1c)|bit(0x1d)|bit(0x1e)|bit(0x1f)

/** Tells whether a code point counts as white space.

    @param ch  code point to test
    @return true if ch is anything other than U+00A0 for which isSpace() holds,
            or if ch is one of the control characters listed in CONTROLSPACE
*/
bool unicode::isWhiteSpace(const sal_uInt32 ch)
{
    // Separator categories count, with U+00A0 explicitly excluded.
    if (ch != 0xa0 && isSpace(ch))
        return true;

    // Only U+0000..U+001F have a bit in CONTROLSPACE; larger values would
    // shift out of range.
    if (ch > 0x1F)
        return false;

    return (bit(ch) & (CONTROLSPACE)) != 0;
}

/** Maps an ICU script code (see unicode/uscript.h) to one of the
    css::i18n::ScriptType classes WEAK, LATIN, ASIAN or COMPLEX.

    Any script code without an explicit label below is classified as COMPLEX.

    NOTE(review): USCRIPT_KATAKANA_OR_HIRAGANA is grouped with the LATIN
    scripts here while USCRIPT_HIRAGANA and USCRIPT_KATAKANA are ASIAN -
    confirm whether that is intended.

    @param eScript  ICU script code
    @return the ScriptType class for eScript
*/
sal_Int16 unicode::getScriptClassFromUScriptCode(UScriptCode eScript)
{
    switch (eScript)
    {
        // Codes that do not denote a concrete writing system.
        case USCRIPT_INVALID_CODE:
        case USCRIPT_COMMON:
        case USCRIPT_INHERITED:
        case USCRIPT_UNWRITTEN_LANGUAGES:
        case USCRIPT_UNKNOWN:
        case USCRIPT_MATHEMATICAL_NOTATION:
        case USCRIPT_SYMBOLS:
        case USCRIPT_CODE_LIMIT:
            return ScriptType::WEAK;

        // Scripts handled like Latin.
        case USCRIPT_ARMENIAN:
        case USCRIPT_CHEROKEE:
        case USCRIPT_COPTIC:
        case USCRIPT_CYRILLIC:
        case USCRIPT_GEORGIAN:
        case USCRIPT_GOTHIC:
        case USCRIPT_GREEK:
        case USCRIPT_LATIN:
        case USCRIPT_OGHAM:
        case USCRIPT_OLD_ITALIC:
        case USCRIPT_RUNIC:
        case USCRIPT_CANADIAN_ABORIGINAL:
        case USCRIPT_BRAILLE:
        case USCRIPT_CYPRIOT:
        case USCRIPT_OSMANYA:
        case USCRIPT_SHAVIAN:
        case USCRIPT_KATAKANA_OR_HIRAGANA:
        case USCRIPT_GLAGOLITIC:
        case USCRIPT_CIRTH:
        case USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC:
        case USCRIPT_OLD_HUNGARIAN:
        case USCRIPT_LATIN_FRAKTUR:
        case USCRIPT_LATIN_GAELIC:
            return ScriptType::LATIN;

        // Scripts handled as Asian.
        case USCRIPT_BOPOMOFO:
        case USCRIPT_HAN:
        case USCRIPT_HANGUL:
        case USCRIPT_HIRAGANA:
        case USCRIPT_KATAKANA:
        case USCRIPT_YI:
        case USCRIPT_SIMPLIFIED_HAN:
        case USCRIPT_TRADITIONAL_HAN:
        case USCRIPT_JAPANESE:
        case USCRIPT_KOREAN:
        case USCRIPT_TANGUT:
        case USCRIPT_KHITAN_SMALL_SCRIPT:
            return ScriptType::ASIAN;

        // Everything else is complex; the explicit labels merely document
        // the scripts that were considered.
        case USCRIPT_ARABIC:
        case USCRIPT_BENGALI:
        case USCRIPT_DESERET:
        case USCRIPT_DEVANAGARI:
        case USCRIPT_ETHIOPIC:
        case USCRIPT_GUJARATI:
        case USCRIPT_GURMUKHI:
        case USCRIPT_HEBREW:
        case USCRIPT_KANNADA:
        case USCRIPT_KHMER:
        case USCRIPT_LAO:
        case USCRIPT_MALAYALAM:
        case USCRIPT_MONGOLIAN:
        case USCRIPT_MYANMAR:
        case USCRIPT_ORIYA:
        case USCRIPT_SINHALA:
        case USCRIPT_SYRIAC:
        case USCRIPT_TAMIL:
        case USCRIPT_TELUGU:
        case USCRIPT_THAANA:
        case USCRIPT_THAI:
        case USCRIPT_TIBETAN:
        case USCRIPT_TAGALOG:
        case USCRIPT_HANUNOO:
        case USCRIPT_BUHID:
        case USCRIPT_TAGBANWA:
        case USCRIPT_LIMBU:
        case USCRIPT_LINEAR_B:
        case USCRIPT_TAI_LE:
        case USCRIPT_UGARITIC:
        case USCRIPT_BUGINESE:
        case USCRIPT_KHAROSHTHI:
        case USCRIPT_SYLOTI_NAGRI:
        case USCRIPT_NEW_TAI_LUE:
        case USCRIPT_TIFINAGH:
        case USCRIPT_OLD_PERSIAN:
        case USCRIPT_BALINESE:
        case USCRIPT_BATAK:
        case USCRIPT_BLISSYMBOLS:
        case USCRIPT_BRAHMI:
        case USCRIPT_CHAM:
        case USCRIPT_DEMOTIC_EGYPTIAN:
        case USCRIPT_HIERATIC_EGYPTIAN:
        case USCRIPT_EGYPTIAN_HIEROGLYPHS:
        case USCRIPT_KHUTSURI:
        case USCRIPT_PAHAWH_HMONG:
        case USCRIPT_HARAPPAN_INDUS:
        case USCRIPT_JAVANESE:
        case USCRIPT_KAYAH_LI:
        case USCRIPT_LEPCHA:
        case USCRIPT_LINEAR_A:
        case USCRIPT_MANDAEAN:
        case USCRIPT_MAYAN_HIEROGLYPHS:
        case USCRIPT_MEROITIC:
        case USCRIPT_NKO:
        case USCRIPT_ORKHON:
        case USCRIPT_OLD_PERMIC:
        case USCRIPT_PHAGS_PA:
        case USCRIPT_PHOENICIAN:
        case USCRIPT_PHONETIC_POLLARD:
        case USCRIPT_RONGORONGO:
        case USCRIPT_SARATI:
        case USCRIPT_ESTRANGELO_SYRIAC:
        case USCRIPT_WESTERN_SYRIAC:
        case USCRIPT_EASTERN_SYRIAC:
        case USCRIPT_TENGWAR:
        case USCRIPT_VAI:
        case USCRIPT_VISIBLE_SPEECH:
        case USCRIPT_CUNEIFORM:
        case USCRIPT_CARIAN:
        case USCRIPT_LANNA:
        case USCRIPT_LYCIAN:
        case USCRIPT_LYDIAN:
        case USCRIPT_OL_CHIKI:
        case USCRIPT_REJANG:
        case USCRIPT_SAURASHTRA:
        case USCRIPT_SIGN_WRITING:
        case USCRIPT_SUNDANESE:
        case USCRIPT_MOON:
        case USCRIPT_MEITEI_MAYEK:
        case USCRIPT_IMPERIAL_ARAMAIC:
        case USCRIPT_AVESTAN:
        case USCRIPT_CHAKMA:
        case USCRIPT_KAITHI:
        case USCRIPT_MANICHAEAN:
        case USCRIPT_INSCRIPTIONAL_PAHLAVI:
        case USCRIPT_PSALTER_PAHLAVI:
        case USCRIPT_BOOK_PAHLAVI:
        case USCRIPT_INSCRIPTIONAL_PARTHIAN:
        case USCRIPT_SAMARITAN:
        case USCRIPT_TAI_VIET:
        case USCRIPT_BAMUM:
        case USCRIPT_LISU:
        case USCRIPT_NAKHI_GEBA:
        case USCRIPT_OLD_SOUTH_ARABIAN:
        case USCRIPT_BASSA_VAH:
        case USCRIPT_DUPLOYAN_SHORTAND:
        case USCRIPT_ELBASAN:
        case USCRIPT_GRANTHA:
        case USCRIPT_KPELLE:
        case USCRIPT_LOMA:
        case USCRIPT_MENDE:
        case USCRIPT_MEROITIC_CURSIVE:
        case USCRIPT_OLD_NORTH_ARABIAN:
        case USCRIPT_NABATAEAN:
        case USCRIPT_PALMYRENE:
        case USCRIPT_SINDHI:
        case USCRIPT_WARANG_CITI:
        default:         // anything new is going to be pretty wild
            return ScriptType::COMPLEX;
    }
}

/** Determines the css::i18n::ScriptType class for a language tag.

    If the tag carries a script subtag, that subtag is resolved to an ICU
    script code via u_getPropertyValueEnum(). Otherwise ICU is asked through
    uscript_getCode() for the scripts of "language" or "language-country", and
    the first script reported decides.

    @param rLanguageTag  tag to classify
    @return the script class of the resolved script; ScriptType::LATIN if
            uscript_getCode() fails or reports no script
*/
sal_Int16 unicode::getScriptClassFromLanguageTag( const LanguageTag& rLanguageTag )
{
    // An explicit script subtag decides on its own.
    if (rLanguageTag.hasScript())
    {
        const OString aScriptName(
                OUStringToOString( rLanguageTag.getScript(), RTL_TEXTENCODING_ASCII_US));
        const int32_t nPropertyValue = u_getPropertyValueEnum( UCHAR_SCRIPT, aScriptName.getStr());
        return getScriptClassFromUScriptCode( static_cast<UScriptCode>(nPropertyValue));
    }

    // No script subtag: build the locale name to hand to ICU.
    OUString aLocaleName;
    if (rLanguageTag.getCountry().isEmpty())
        aLocaleName = rLanguageTag.getLanguage();
    else
        aLocaleName = rLanguageTag.getLanguage() + "-" + rLanguageTag.getCountry();

    constexpr int32_t nCapacity = 42;
    UScriptCode aScripts[nCapacity];
    UErrorCode eStatus = U_ZERO_ERROR;
    const int32_t nFound = uscript_getCode(
            OUStringToOString( aLocaleName, RTL_TEXTENCODING_ASCII_US).getStr(),
            aScripts, nCapacity, &eStatus);

    // U_BUFFER_OVERFLOW_ERROR would be set with too many scripts for the
    // buffer and the required capacity returned; like any other failure that
    // simply ends up as LATIN.
    if (U_FAILURE(eStatus) || nFound == 0)
        return css::i18n::ScriptType::LATIN;

    return getScriptClassFromUScriptCode( aScripts[0]);
}

/** Returns a language tag that serves as an example of a language written in
    the given ICU script.

    The result is either a bare language code ("ar"), or a language code with
    a script subtag ("mn-Soyo"). "zxx" is used for codes that carry no
    linguistic content, "und" for undetermined ones, and "mis-Xxxx" where a
    script exists but no language code is available for it.

    There is intentionally no default label, so a script code that is not
    listed falls through the switch.

    @param eScript  ICU script code
    @return the exemplar language tag; an empty string if eScript is not
            listed below
*/
OString unicode::getExemplarLanguageForUScriptCode(UScriptCode eScript)
{
    switch (eScript)
    {
        case USCRIPT_CODE_LIMIT:
        case USCRIPT_INVALID_CODE:
            return "zxx"_ostr;
        case USCRIPT_COMMON:
        case USCRIPT_INHERITED:
            return "und"_ostr;
        case USCRIPT_MATHEMATICAL_NOTATION:
        case USCRIPT_SYMBOLS:
            return "zxx"_ostr;
        case USCRIPT_UNWRITTEN_LANGUAGES:
        case USCRIPT_UNKNOWN:
            return "und"_ostr;
        case USCRIPT_ARABIC:
            return "ar"_ostr;
        case USCRIPT_ARMENIAN:
            return "hy"_ostr;
        case USCRIPT_BENGALI:
            return "bn"_ostr;
        case USCRIPT_BOPOMOFO:
            return "zh"_ostr;
        case USCRIPT_CHEROKEE:
            return "chr"_ostr;
        case USCRIPT_COPTIC:
            return "cop"_ostr;
        case USCRIPT_CYRILLIC:
            return "ru"_ostr;
        case USCRIPT_DESERET:
            return "en"_ostr;
        case USCRIPT_DEVANAGARI:
            return "hi"_ostr;
        case USCRIPT_ETHIOPIC:
            return "am"_ostr;
        case USCRIPT_GEORGIAN:
            return "ka"_ostr;
        case USCRIPT_GOTHIC:
            return "got"_ostr;
        case USCRIPT_GREEK:
            return "el"_ostr;
        case USCRIPT_GUJARATI:
            return "gu"_ostr;
        case USCRIPT_GURMUKHI:
            return "pa"_ostr;
        case USCRIPT_HAN:
            return "zh"_ostr;
        case USCRIPT_HANGUL:
            return "ko"_ostr;
        case USCRIPT_HEBREW:
            return "he"_ostr;        // Hebrew; "hr" would be Croatian
        case USCRIPT_HIRAGANA:
            return "ja"_ostr;
        case USCRIPT_KANNADA:
            return "kn"_ostr;
        case USCRIPT_KATAKANA:
            return "ja"_ostr;
        case USCRIPT_KHMER:
            return "km"_ostr;
        case USCRIPT_LAO:
            return "lo"_ostr;
        case USCRIPT_LATIN:
            return "en"_ostr;
        case USCRIPT_MALAYALAM:
            return "ml"_ostr;
        case USCRIPT_MONGOLIAN:
            return "mn"_ostr;
        case USCRIPT_MYANMAR:
            return "my"_ostr;
        case USCRIPT_OGHAM:
            return "pgl"_ostr;
        case USCRIPT_OLD_ITALIC:
            return "osc"_ostr;
        case USCRIPT_ORIYA:
            return "or"_ostr;
        case USCRIPT_RUNIC:
            return "ang"_ostr;
        case USCRIPT_SINHALA:
            return "si"_ostr;
        case USCRIPT_SYRIAC:
            return "syr"_ostr;
        case USCRIPT_TAMIL:
            return "ta"_ostr;
        case USCRIPT_TELUGU:
            return "te"_ostr;
        case USCRIPT_THAANA:
            return "dv"_ostr;
        case USCRIPT_THAI:
            return "th"_ostr;
        case USCRIPT_TIBETAN:
            return "bo"_ostr;
        case USCRIPT_CANADIAN_ABORIGINAL:
            return "iu"_ostr;
        case USCRIPT_YI:
            return "ii"_ostr;
        case USCRIPT_TAGALOG:
            return "tl"_ostr;
        case USCRIPT_HANUNOO:
            return "hnn"_ostr;
        case USCRIPT_BUHID:
            return "bku"_ostr;
        case USCRIPT_TAGBANWA:
            return "tbw"_ostr;
        case USCRIPT_BRAILLE:
            return "en"_ostr;
        case USCRIPT_CYPRIOT:
            return "ecy"_ostr;
        case USCRIPT_LIMBU:
            return "lif"_ostr;
        case USCRIPT_LINEAR_B:
            return "gmy"_ostr;
        case USCRIPT_OSMANYA:
            return "so"_ostr;
        case USCRIPT_SHAVIAN:
            return "en"_ostr;
        case USCRIPT_TAI_LE:
            return "tdd"_ostr;
        case USCRIPT_UGARITIC:
            return "uga"_ostr;
        case USCRIPT_KATAKANA_OR_HIRAGANA:
            return "ja"_ostr;
        case USCRIPT_BUGINESE:
            return "bug"_ostr;
        case USCRIPT_GLAGOLITIC:
            return "cu"_ostr;        // Church Slavic; "ch" would be Chamorro
        case USCRIPT_KHAROSHTHI:
            return "pra"_ostr;
        case USCRIPT_SYLOTI_NAGRI:
            return "syl"_ostr;
        case USCRIPT_NEW_TAI_LUE:
            return "khb"_ostr;
        case USCRIPT_TIFINAGH:
            return "tmh"_ostr;
        case USCRIPT_OLD_PERSIAN:
            return "peo"_ostr;
        case USCRIPT_BALINESE:
            return "ban"_ostr;
        case USCRIPT_BATAK:
            return "btk"_ostr;
        case USCRIPT_BLISSYMBOLS:
            return "en"_ostr;
        case USCRIPT_BRAHMI:
            return "pra"_ostr;
        case USCRIPT_CHAM:
            return "cja"_ostr;
        case USCRIPT_CIRTH:
            return "sjn"_ostr;
        case USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC:
            return "cu"_ostr;
        case USCRIPT_DEMOTIC_EGYPTIAN:
        case USCRIPT_HIERATIC_EGYPTIAN:
        case USCRIPT_EGYPTIAN_HIEROGLYPHS:
            return "egy"_ostr;
        case USCRIPT_KHUTSURI:
            return "ka"_ostr;
        case USCRIPT_SIMPLIFIED_HAN:
            return "zh"_ostr;
        case USCRIPT_TRADITIONAL_HAN:
            return "zh"_ostr;
        case USCRIPT_PAHAWH_HMONG:
            return "blu"_ostr;
        case USCRIPT_OLD_HUNGARIAN:
            return "ohu"_ostr;
        case USCRIPT_HARAPPAN_INDUS:
            return "xiv"_ostr;
        case USCRIPT_JAVANESE:
            return "kaw"_ostr;
        case USCRIPT_KAYAH_LI:
            return "eky"_ostr;
        case USCRIPT_LATIN_FRAKTUR:
            return "de"_ostr;
        case USCRIPT_LATIN_GAELIC:
            return "ga"_ostr;
        case USCRIPT_LEPCHA:
            return "lep"_ostr;
        case USCRIPT_LINEAR_A:
            return "ecr"_ostr;
        case USCRIPT_MAYAN_HIEROGLYPHS:
            return "myn"_ostr;
        case USCRIPT_MEROITIC:
            return "xmr"_ostr;
        case USCRIPT_NKO:
            return "nqo"_ostr;
        case USCRIPT_ORKHON:
            return "otk"_ostr;
        case USCRIPT_OLD_PERMIC:
            return "kv"_ostr;
        case USCRIPT_PHAGS_PA:
            return "xng"_ostr;
        case USCRIPT_PHOENICIAN:
            return "phn"_ostr;
        case USCRIPT_PHONETIC_POLLARD:
            return "hmd"_ostr;
        case USCRIPT_RONGORONGO:
            return "rap"_ostr;
        case USCRIPT_SARATI:
            return "qya"_ostr;
        case USCRIPT_ESTRANGELO_SYRIAC:
            return "syr"_ostr;
        case USCRIPT_WESTERN_SYRIAC:
            return "tru"_ostr;
        case USCRIPT_EASTERN_SYRIAC:
            return "aii"_ostr;
        case USCRIPT_TENGWAR:
            return "sjn"_ostr;
        case USCRIPT_VAI:
            return "vai"_ostr;
        case USCRIPT_VISIBLE_SPEECH:
            return "en"_ostr;
        case USCRIPT_CUNEIFORM:
            return "akk"_ostr;
        case USCRIPT_CARIAN:
            return "xcr"_ostr;
        case USCRIPT_JAPANESE:
            return "ja"_ostr;
        case USCRIPT_LANNA:
            return "nod"_ostr;
        case USCRIPT_LYCIAN:
            return "xlc"_ostr;
        case USCRIPT_LYDIAN:
            return "xld"_ostr;
        case USCRIPT_OL_CHIKI:
            return "sat"_ostr;
        case USCRIPT_REJANG:
            return "rej"_ostr;
        case USCRIPT_SAURASHTRA:
            return "saz"_ostr;
        case USCRIPT_SIGN_WRITING:
            return "en"_ostr;
        case USCRIPT_SUNDANESE:
            return "su"_ostr;
        case USCRIPT_MOON:
            return "en"_ostr;
        case USCRIPT_MEITEI_MAYEK:
            return "mni"_ostr;
        case USCRIPT_IMPERIAL_ARAMAIC:
            return "arc"_ostr;
        case USCRIPT_AVESTAN:
            return "ae"_ostr;
        case USCRIPT_CHAKMA:
            return "ccp"_ostr;
        case USCRIPT_KOREAN:
            return "ko"_ostr;
        case USCRIPT_KAITHI:
            return "awa"_ostr;
        case USCRIPT_MANICHAEAN:
            return "xmn"_ostr;
        case USCRIPT_INSCRIPTIONAL_PAHLAVI:
        case USCRIPT_PSALTER_PAHLAVI:
        case USCRIPT_BOOK_PAHLAVI:
        case USCRIPT_INSCRIPTIONAL_PARTHIAN:
            return "xpr"_ostr;
        case USCRIPT_SAMARITAN:
            return "heb"_ostr;
        case USCRIPT_TAI_VIET:
            return "blt"_ostr;
        case USCRIPT_MANDAEAN: /* Aliased to USCRIPT_MANDAIC in icu 4.6. */
            return "mic"_ostr;
        case USCRIPT_NABATAEAN:
            return "mis-Nbat"_ostr;  // Uncoded with script
        case USCRIPT_PALMYRENE:
            return "mis-Palm"_ostr;  // Uncoded with script
        case USCRIPT_BAMUM:
            return "bax"_ostr;
        case USCRIPT_LISU:
            return "lis"_ostr;
        case USCRIPT_NAKHI_GEBA:
            return "nxq"_ostr;
        case USCRIPT_OLD_SOUTH_ARABIAN:
            return "xsa"_ostr;
        case USCRIPT_BASSA_VAH:
            return "bsq"_ostr;
        case USCRIPT_DUPLOYAN_SHORTAND:
            return "fr"_ostr;
        case USCRIPT_ELBASAN:
            return "sq"_ostr;
        case USCRIPT_GRANTHA:
            return "ta"_ostr;
        case USCRIPT_KPELLE:
            return "kpe"_ostr;
        case USCRIPT_LOMA:
            return "lom"_ostr;
        case USCRIPT_MENDE:
            return "men"_ostr;
        case USCRIPT_MEROITIC_CURSIVE:
            return "xmr"_ostr;
        case USCRIPT_OLD_NORTH_ARABIAN:
            return "xna"_ostr;
        case USCRIPT_SINDHI:
            return "sd"_ostr;
        case USCRIPT_WARANG_CITI:
            return "hoc"_ostr;
        case USCRIPT_AFAKA:
            return "djk"_ostr;
        case USCRIPT_JURCHEN:
            return "juc"_ostr;
        case USCRIPT_MRO:
            return "cmr"_ostr;
        case USCRIPT_NUSHU:
            return "mis-Nshu"_ostr;  // Uncoded with script
        case USCRIPT_SHARADA:
            return "sa"_ostr;
        case USCRIPT_SORA_SOMPENG:
            return "srb"_ostr;
        case USCRIPT_TAKRI:
            return "doi"_ostr;
        case USCRIPT_TANGUT:
            return "txg"_ostr;
        case USCRIPT_WOLEAI:
            return "woe"_ostr;
        case USCRIPT_ANATOLIAN_HIEROGLYPHS:
            return "hlu"_ostr;
        case USCRIPT_KHOJKI:
            return "gu"_ostr;
        case USCRIPT_TIRHUTA:
            return "mai"_ostr;
        case USCRIPT_CAUCASIAN_ALBANIAN:
            return "xag"_ostr;
        case USCRIPT_MAHAJANI:
            return "mwr"_ostr;
        case USCRIPT_AHOM:
            return "aho"_ostr;
        case USCRIPT_HATRAN:
            return "qly-Hatr"_ostr;
        case USCRIPT_MODI:
            return "mr-Modi"_ostr;
        case USCRIPT_MULTANI:
            return "skr-Mult"_ostr;  // ISO 15924 code for Multani is "Mult"
        case USCRIPT_PAU_CIN_HAU:
            return "ctd-Pauc"_ostr;
        case USCRIPT_SIDDHAM:
            return "sa-Sidd"_ostr;
        case USCRIPT_ADLAM:
            return "mis-Adlm"_ostr;   // Adlam for Fulani, no language code
        case USCRIPT_BHAIKSUKI:
            return "mis-Bhks"_ostr;   // Bhaiksuki for some Buddhist texts, no language code
        case USCRIPT_MARCHEN:
            return "bo-Marc"_ostr;
        case USCRIPT_NEWA:
            return "new-Newa"_ostr;
        case USCRIPT_OSAGE:
            return "osa-Osge"_ostr;
        case USCRIPT_HAN_WITH_BOPOMOFO:
            return "mis-Hanb"_ostr;   // Han with Bopomofo, zh-Hanb ?
        case USCRIPT_JAMO:
            return "ko"_ostr;   // Jamo - elements of Hangul Syllables
        case USCRIPT_SYMBOLS_EMOJI:
            return "mis-Zsye"_ostr;   // Emoji variant
        case USCRIPT_MASARAM_GONDI:
            return "gon-Gonm"_ostr;  // macro language code, could be wsg,esg,gno
        case USCRIPT_SOYOMBO:
            return "mn-Soyo"_ostr;   // abugida to write Mongolian, also Tibetan and Sanskrit
        case USCRIPT_ZANABAZAR_SQUARE:
            return "mn-Zanb"_ostr;   // abugida to write Mongolian
        case USCRIPT_DOGRA:
            return "dgo"_ostr;       // Dogri proper
        case USCRIPT_GUNJALA_GONDI:
            return "wsg"_ostr;       // Adilabad Gondi
        case USCRIPT_MAKASAR:
            return "mak"_ostr;
        case USCRIPT_MEDEFAIDRIN:
            return "dmf-Medf"_ostr;
        case USCRIPT_HANIFI_ROHINGYA:
            return "rhg"_ostr;
        case USCRIPT_SOGDIAN:
            return "sog"_ostr;
        case USCRIPT_OLD_SOGDIAN:
            return "sog"_ostr;
        case USCRIPT_ELYMAIC:
            return "arc-Elym"_ostr;
        case USCRIPT_NYIAKENG_PUACHUE_HMONG:
            return "hmn-Hmnp"_ostr;  // macrolanguage code
        case USCRIPT_NANDINAGARI:
            return "sa-Nand"_ostr;
        case USCRIPT_WANCHO:
            return "nnp-Wcho"_ostr;
        case USCRIPT_CHORASMIAN:
            return "xco-Chrs"_ostr;
        case USCRIPT_DIVES_AKURU:
            return "dv-Diak"_ostr;
        case USCRIPT_KHITAN_SMALL_SCRIPT:
            return "zkt-Kits"_ostr;
        case USCRIPT_YEZIDI:
            return "kmr-Yezi"_ostr;
#if (U_ICU_VERSION_MAJOR_NUM >= 70)
        case USCRIPT_CYPRO_MINOAN:
            return "mis-Cpmn"_ostr;  // Uncoded with script
        case USCRIPT_OLD_UYGHUR:
            return "oui-Ougr"_ostr;
        case USCRIPT_TANGSA:
            return "nst-Tnsa"_ostr;
        case USCRIPT_TOTO:
            return "txo-Toto"_ostr;
        case USCRIPT_VITHKUQI:
            return "sq-Vith"_ostr;   // macrolanguage code
#endif
#if (U_ICU_VERSION_MAJOR_NUM >= 72)
        case USCRIPT_KAWI:
            return "mis-Kawi"_ostr;  // Uncoded with script
        case USCRIPT_NAG_MUNDARI:
            return "unr-Nagm"_ostr;
#endif
#if (U_ICU_VERSION_MAJOR_NUM >= 75)
        case USCRIPT_ARABIC_NASTALIQ:
            return "fa-Aran"_ostr;
#endif
    }

    // Script code not listed above.
    return OString();
}

//Format a number as a percentage according to the rules of the given
//language, e.g. 100 -> "100%" for en-US vs "100 %" for de-DE
//
//dNumber is the percentage itself (100 means 100%); it is divided by 100
//before being handed to ICU's percent formatter. If no percent formatter
//can be created, the plain number followed by "%" is returned.
OUString unicode::formatPercent(double dNumber,
    const LanguageTag &rLangTag)
{
    // Work on a copy, as the locale used for formatting may be replaced.
    LanguageTag aFormatTag(rLangTag);

    // As of CLDR Version 24 these languages were not listed as using spacing
    // between number and % but are reported as such by our l10n groups
    // http://www.unicode.org/cldr/charts/24/by_type/numbers.number_formatting_patterns.html
    // so format using French which has the desired rules
    const OUString aFormatLanguage(aFormatTag.getLanguage());
    if (aFormatLanguage == "es" || aFormatLanguage == "sl")
        aFormatTag.reset(u"fr-FR"_ustr);

    // get a percent formatter for the resulting locale
    icu::Locale aIcuLocale = LanguageTagIcu::getIcuLocale(aFormatTag);
    UErrorCode eStatus = U_ZERO_ERROR;
    std::unique_ptr<icu::NumberFormat> pFormatter(
        icu::NumberFormat::createPercentInstance(aIcuLocale, eStatus));
    if (U_FAILURE(eStatus))
    {
        SAL_WARN("i18n", "icu::NumberFormat::createPercentInstance failed");
        return OUString::number(dNumber) + "%";
    }

    icu::UnicodeString aFormatted;
    pFormatter->format(dNumber/100, aFormatted);
    OUString aRet(reinterpret_cast<const sal_Unicode *>(aFormatted.getBuffer()),
        aFormatted.length());

    // Only German (judged by the tag that was passed in) gets special
    // treatment; everything else is returned as formatted.
    if (rLangTag.getLanguage() != "de")
        return aRet;

    //narrow no-break space instead of (normal) no-break space
    return aRet.replace(0x00A0, 0x202F);
}

// Offers one more character to the collector and reports whether the caller
// may keep offering characters.
//
// uChar is either a complete code point or a single UTF-16 surrogate half.
// Surrogate halves are parked in maUtf16 until a high/low pair can be
// combined, combining marks are gathered in maCombining until a base
// character arrives, and everything else is treated as (potential) hex or
// "U+" notation and collected in maInput.
//
// Accepted characters are inserted at the front of the buffers
// (insert(0, ...) / insertUtf32(0, ...)), so the characters are presumably
// offered back to front, i.e. walking backwards through the text
// (NOTE(review): confirm against the callers).
//
// Returns the value of mbAllowMoreChars after processing uChar: true if more
// input is acceptable, false once the collected sequence is complete or
// uChar had to be rejected.
bool ToggleUnicodeCodepoint::AllowMoreInput(sal_uInt32 uChar)
{
    //arbitrarily chosen maximum length allowed - normal max usage would be around 30.
    if( maInput.getLength() > 255 )
        mbAllowMoreChars = false;

    // once input has been closed, every further character is refused
    if( !mbAllowMoreChars )
        return false;

    // As soon as a "U+" has been collected, only further U+ notation may
    // follow: surrogates and combining marks then end the input, and other
    // non-hex characters are no longer stored.
    bool bPreventNonHex = false;
    if( maInput.indexOf("U+") != -1 )
        bPreventNonHex = true;

    switch ( unicode::getUnicodeType(uChar) )
    {
        case css::i18n::UnicodeType::SURROGATE:
            if( bPreventNonHex )
            {
                mbAllowMoreChars = false;
                return false;
            }

            // a low surrogate arriving before anything else: keep it and
            // wait for the matching high surrogate
            if( rtl::isLowSurrogate(uChar) && maUtf16.isEmpty() && maInput.isEmpty()  )
            {
                maUtf16.append(sal_Unicode(uChar));
                return true;
            }
            // a high surrogate goes in front of whatever is parked in maUtf16
            if( rtl::isHighSurrogate(uChar) && maInput.isEmpty() )
                maUtf16.insert(0, sal_Unicode(uChar));
            if (maUtf16.getLength() == 2)
            {
                assert(rtl::isHighSurrogate(maUtf16[0]) && rtl::isLowSurrogate(maUtf16[1]));
                // The resulting codepoint may itself be combining, so may allow more
                sal_uInt32 nUCS4 = rtl::combineSurrogates(maUtf16[0], maUtf16[1]);
                maUtf16.setLength(0);
                // re-enter with the combined code point so it is classified
                // like any other character
                return AllowMoreInput(nUCS4);
            }
            // unexpected order of high/low, so don't accept more
            if( !maUtf16.isEmpty() )
                maInput.append(maUtf16);
            if( !maCombining.isEmpty() )
                maInput.append(maCombining);
            mbAllowMoreChars = false;
            break;

        case css::i18n::UnicodeType::NON_SPACING_MARK:
        case css::i18n::UnicodeType::COMBINING_SPACING_MARK:
            if( bPreventNonHex )
            {
                mbAllowMoreChars = false;
                return false;
            }

            //extreme edge case: already invalid high/low surrogates with preceding combining chars, and now an extra combining mark.
            if( !maUtf16.isEmpty() )
            {
                maInput = maUtf16;
                if( !maCombining.isEmpty() )
                    maInput.append(maCombining);
                mbAllowMoreChars = false;
                return false;
            }
            // collect the mark; input stays open (mbAllowMoreChars is left
            // untouched) so that a base character can still be offered
            maCombining.insertUtf32(0, uChar);
            break;

        default:
            //extreme edge case: already invalid high/low surrogates with preceding combining chars, and now an extra character.
            if( !maUtf16.isEmpty() )
            {
                maInput = maUtf16;
                if( !maCombining.isEmpty() )
                    maInput.append(maCombining);
                mbAllowMoreChars = false;
                return false;
            }

            // combining marks were collected: uChar is put in front of them
            // and the whole sequence becomes the input; nothing more is taken
            if( !maCombining.isEmpty() )
            {
                maCombining.insertUtf32(0, uChar);
                maInput = maCombining;
                mbAllowMoreChars = false;
                return false;
            }

            // 0 - 1f are control characters.  Do not process those.
            if( uChar < 0x20 )
            {
                mbAllowMoreChars = false;
                return false;
            }

            switch( uChar )
            {
                case 'u':
                case 'U':
                    // U+ notation found.  Continue looking for another one.
                    // (mbRequiresU was set by a '+' offered just before)
                    if( mbRequiresU )
                    {
                        mbRequiresU = false;
                        maInput.insert(0,"U+");
                    }
                    // treat as a normal character
                    else
                    {
                        mbAllowMoreChars = false;
                        if( !bPreventNonHex )
                            maInput.insertUtf32(0, uChar);
                    }
                    break;
                case '+':
                    // + already found: skip when not U, or edge case of +U+xxxx
                    if( mbRequiresU || (maInput.indexOf("U+") == 0) )
                        mbAllowMoreChars = false;
                    // hex chars followed by '+' - now require a 'U'
                    else if ( !maInput.isEmpty() )
                        mbRequiresU = true;
                    // treat as a normal character
                    else
                    {
                        mbAllowMoreChars = false;
                        if( !bPreventNonHex )
                            maInput.insertUtf32(0, uChar);
                    }
                    break;
                default:
                    // + already found. Since not U, cancel further input
                    if( mbRequiresU )
                        mbAllowMoreChars = false;
                    // maximum digits per notation is 8: only one notation
                    else if( maInput.indexOf("U+") == -1 && maInput.getLength() == 8 )
                        mbAllowMoreChars = false;
                    // maximum digits per notation is 8: previous notation found
                    else if( maInput.indexOf("U+") == 8 )
                        mbAllowMoreChars = false;
                    // a hex character. Add to string.
                    else if( rtl::isAsciiHexDigit(uChar) )
                    {
                        mbIsHexString = true;
                        maInput.insertUtf32(0, uChar);
                    }
                    // not a hex character: stop input. keep if it is the first input provided
                    else
                    {
                        mbAllowMoreChars = false;
                        if( maInput.isEmpty() )
                            maInput.insertUtf32(0, uChar);
                    }
            }
    }
    // reached by every path that left the switch via break or fell off its end
    return mbAllowMoreChars;
}

// Returns the collected text that is to be replaced, after validating it.
//
// For non-hex input the collected characters are returned unchanged. For hex
// / "U+" notation the buffer is trimmed so that it only describes code points
// that are valid Unicode and not control characters; this may modify maInput
// and closes the collector for further input.
OUString ToggleUnicodeCodepoint::StringToReplace()
{
    const bool bNothingCollected = maInput.isEmpty();

    //edge case - input finished with incomplete low surrogate or combining characters without a base
    if( bNothingCollected && mbAllowMoreChars )
    {
        if( !maUtf16.isEmpty() )
            maInput = maUtf16;
        if( !maCombining.isEmpty() )
            maInput.append(maCombining);
    }

    // nothing to validate unless hex notation was collected
    if( bNothingCollected || !mbIsHexString )
        return maInput.toString();

    //this function potentially modifies the input string.  Prevent addition of further characters
    mbAllowMoreChars = false;

    // a value is unusable if it is no Unicode code point or a control character
    const auto isRejected = [](sal_uInt32 nCode)
    {
        return !rtl::isUnicodeCodePoint(nCode) || nCode < 0x20;
    };

    //validate unicode notation.
    OUString sDigits;
    sal_Int32 nNextMarker = maInput.indexOf("U+");
    if( nNextMarker == -1 )
    {
        // plain hex digits without any U+ prefix
        sDigits = maInput.toString();
    }
    else
    {
        //U+ notation used: strip off all extra chars added not in U+ notation,
        //then continue with what follows the first "U+"
        maInput.remove(0, nNextMarker);
        sDigits = maInput.copy(2).makeStringAndClear();
        nNextMarker = sDigits.indexOf("U+");
    }

    // walk over every notation that is followed by another "U+"
    while( nNextMarker != -1 )
    {
        //prevent creating control characters or invalid Unicode values:
        //keep only what starts at the following "U+"
        if( isRejected(o3tl::toUInt32(sDigits.subView(0, nNextMarker), 16)) )
            maInput = sDigits.subView(nNextMarker);
        sDigits = sDigits.copy(nNextMarker+2);
        nNextMarker = sDigits.indexOf("U+");
    }

    // the last notation: if unusable, only its final character is replaced
    if( isRejected(sDigits.toUInt32(16)) )
       maInput.truncate().append( sDigits[sDigits.getLength()-1] );
    return maInput.toString();
}

// Produces the text that replaces StringToReplace(): hex / "U+" notation is
// turned into the characters it denotes, anything else is turned into "U+"
// notation (one "U+xxxx" group per code point, at least four hex digits).
OUString ToggleUnicodeCodepoint::ReplacementString()
{
    OUString sPending = StringToReplace();
    OUStringBuffer aResult = "";
    sal_Int32 nMarker = sPending.indexOf("U+");

    const bool bHexToGlyph
        = nMarker != -1 || (sPending.getLength() > 1 && mbIsHexString);
    if( !bHexToGlyph )
    {
        // convert from glyph to hex notation, one code point at a time
        sal_Int32 nIndex = 0;
        while( nIndex < sPending.getLength() )
        {
            OUStringBuffer aHex = OUString::number(sPending.iterateCodePoints(&nIndex),16);
            //pad with zeros - minimum length of 4.
            sal_Int32 nMissing = 4 - aHex.getLength();
            while( nMissing > 0 )
            {
                aHex.insert( 0,"0" );
                --nMissing;
            }
            aResult.append( "U+" + aHex );
        }
        return aResult.makeStringAndClear();
    }

    // convert from hex notation to glyph
    // a leading "U+" has no digits in front of it: just drop it
    if( nMarker == 0 )
    {
        sPending = sPending.copy(2);
        nMarker = sPending.indexOf("U+");
    }
    // every remaining "U+" terminates the digits of one code point
    for( ; nMarker > 0; nMarker = sPending.indexOf("U+") )
    {
        aResult.appendUtf32( o3tl::toUInt32(sPending.subView(0, nMarker), 16) );
        sPending = sPending.copy(nMarker+2);
    }
    // whatever is left are the digits of the final code point
    aResult.appendUtf32( sPending.toUInt32(16) );
    return aResult.makeStringAndClear();
}

/* vim:set shiftwidth=4 softtabstop=4 expandtab: */