29f5742bc8
Commit 776f7e7463
changed cclass_Unicode
to reject group separators in numbers by default, but users are
complaining that the neat "5.000" numbers in their existing documents
are now considered invalid.
* in SwCalc, use GROUP_SEPARATOR_IN_NUMBER
* in cclass_Unicode::parseText(), reject a group separator if it is not
followed by at least 3 digits
With this, a number from tdf#42518 "0.19" is still considered invalid,
while "5.000" is now valid again.
Change-Id: If86f2ed4c27be16f866d7f4cee00789344e9ee2e
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/153047
Tested-by: Michael Stahl <michael.stahl@allotropia.de>
Reviewed-by: Michael Stahl <michael.stahl@allotropia.de>
1076 lines
45 KiB
C++
1076 lines
45 KiB
C++
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
|
|
/*
|
|
* This file is part of the LibreOffice project.
|
|
*
|
|
* This Source Code Form is subject to the terms of the Mozilla Public
|
|
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
*
|
|
* This file incorporates work covered by the following license notice:
|
|
*
|
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
* contributor license agreements. See the NOTICE file distributed
|
|
* with this work for additional information regarding copyright
|
|
* ownership. The ASF licenses this file to you under the Apache
|
|
* License, Version 2.0 (the "License"); you may not use this file
|
|
* except in compliance with the License. You may obtain a copy of
|
|
* the License at http://www.apache.org/licenses/LICENSE-2.0 .
|
|
*/
|
|
|
|
|
|
#include <cclass_unicode.hxx>
|
|
#include <unicode/uchar.h>
|
|
#include <rtl/character.hxx>
|
|
#include <rtl/math.hxx>
|
|
#include <rtl/ustring.hxx>
|
|
#include <com/sun/star/i18n/KParseTokens.hpp>
|
|
#include <com/sun/star/i18n/KParseType.hpp>
|
|
#include <com/sun/star/i18n/LocaleData2.hpp>
|
|
#include <com/sun/star/i18n/NativeNumberMode.hpp>
|
|
#include <com/sun/star/i18n/NativeNumberSupplier.hpp>
|
|
|
|
#include <string.h>
|
|
#include <string_view>
|
|
|
|
using namespace ::com::sun::star::uno;
|
|
using namespace ::com::sun::star::i18n;
|
|
using namespace ::com::sun::star::lang;
|
|
|
|
// Combined ParserFlags given to every digit entry: used for ASCII '0'-'9' in
// pDefaultParserTable and OR-ed in for the Unicode number categories in
// getFlagsExtended().
#define TOKEN_DIGIT_FLAGS (ParserFlags::CHAR_VALUE | ParserFlags::VALUE | ParserFlags::VALUE_EXP | ParserFlags::VALUE_EXP_VALUE | ParserFlags::VALUE_DIGIT)
|
|
|
|
namespace i18npool {
|
|
|
|
// Default identifier/name specification is [A-Za-z_][A-Za-z0-9_]*
|
|
|
|
// Number of entries in the per-character lookup tables (pDefaultParserTable,
// pParseTokensType, pTable); code points below this value are looked up by
// direct index, everything else is classified via ICU.
const sal_uInt8 cclass_Unicode::nDefCnt = 128;
|
|
// Default ParserFlags for each of the first nDefCnt (128) code points, indexed
// by code point. initParserTable() copies this table into pTable and then
// adjusts the copy for the locale and the requested token types.
const ParserFlags cclass_Unicode::pDefaultParserTable[ nDefCnt ] =
{
// A trailing "(...)" comment shows the Calc formula compiler specific value
// that was commented out and modified for this default table.

    /*   0 \0 */ ParserFlags::EXCLUDED,
    /*   1- 4 */ ParserFlags::ILLEGAL, ParserFlags::ILLEGAL, ParserFlags::ILLEGAL, ParserFlags::ILLEGAL,
    /*   5- 8 */ ParserFlags::ILLEGAL, ParserFlags::ILLEGAL, ParserFlags::ILLEGAL, ParserFlags::ILLEGAL,
    /*   9 \t */ ParserFlags::CHAR_DONTCARE | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::ILLEGAL)
    /*  10    */ ParserFlags::ILLEGAL,
    /*  11 \v */ ParserFlags::CHAR_DONTCARE | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::ILLEGAL)
    /*  12-15 */ ParserFlags::ILLEGAL, ParserFlags::ILLEGAL, ParserFlags::ILLEGAL, ParserFlags::ILLEGAL,
    /*  16-19 */ ParserFlags::ILLEGAL, ParserFlags::ILLEGAL, ParserFlags::ILLEGAL, ParserFlags::ILLEGAL,
    /*  20-23 */ ParserFlags::ILLEGAL, ParserFlags::ILLEGAL, ParserFlags::ILLEGAL, ParserFlags::ILLEGAL,
    /*  24-27 */ ParserFlags::ILLEGAL, ParserFlags::ILLEGAL, ParserFlags::ILLEGAL, ParserFlags::ILLEGAL,
    /*  28-31 */ ParserFlags::ILLEGAL, ParserFlags::ILLEGAL, ParserFlags::ILLEGAL, ParserFlags::ILLEGAL,

    /*  32    */ ParserFlags::CHAR_DONTCARE | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,
    /*  33 !  */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,
    /*  34 "  */ ParserFlags::CHAR_STRING | ParserFlags::STRING_SEP,
    /*  35 #  */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::WORD_SEP)
    /*  36 $  */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::CHAR_WORD | ParserFlags::WORD)
    /*  37 %  */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::VALUE)
    /*  38 &  */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,
    /*  39 '  */ ParserFlags::NAME_SEP,
    /*  40 (  */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,
    /*  41 )  */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,
    /*  42 *  */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,
    /*  43 +  */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP | ParserFlags::VALUE_EXP | ParserFlags::VALUE_SIGN,
    /*  44 ,  */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::CHAR_VALUE | ParserFlags::VALUE)
    /*  45 -  */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP | ParserFlags::VALUE_EXP | ParserFlags::VALUE_SIGN,
    /*  46 .  */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::WORD | ParserFlags::CHAR_VALUE | ParserFlags::VALUE)
    /*  47 /  */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,

    //for ( i = 48; i < 58; i++ )
    /*  48 0 */ TOKEN_DIGIT_FLAGS | ParserFlags::WORD, /*  49 1 */ TOKEN_DIGIT_FLAGS | ParserFlags::WORD,
    /*  50 2 */ TOKEN_DIGIT_FLAGS | ParserFlags::WORD, /*  51 3 */ TOKEN_DIGIT_FLAGS | ParserFlags::WORD,
    /*  52 4 */ TOKEN_DIGIT_FLAGS | ParserFlags::WORD, /*  53 5 */ TOKEN_DIGIT_FLAGS | ParserFlags::WORD,
    /*  54 6 */ TOKEN_DIGIT_FLAGS | ParserFlags::WORD, /*  55 7 */ TOKEN_DIGIT_FLAGS | ParserFlags::WORD,
    /*  56 8 */ TOKEN_DIGIT_FLAGS | ParserFlags::WORD, /*  57 9 */ TOKEN_DIGIT_FLAGS | ParserFlags::WORD,

    /*  58 :  */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::WORD)
    /*  59 ;  */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,
    /*  60 <  */ ParserFlags::CHAR_BOOL | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,
    /*  61 =  */ ParserFlags::CHAR | ParserFlags::BOOL | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,
    /*  62 >  */ ParserFlags::CHAR_BOOL | ParserFlags::BOOL | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,
    /*  63 ?  */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::CHAR_WORD | ParserFlags::WORD)
    /*  64 @  */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::ILLEGAL // UNUSED)

    //for ( i = 65; i < 91; i++ )
    /*  65 A */ ParserFlags::CHAR_WORD | ParserFlags::WORD, /*  66 B */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /*  67 C */ ParserFlags::CHAR_WORD | ParserFlags::WORD, /*  68 D */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /*  69 E */ ParserFlags::CHAR_WORD | ParserFlags::WORD, /*  70 F */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /*  71 G */ ParserFlags::CHAR_WORD | ParserFlags::WORD, /*  72 H */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /*  73 I */ ParserFlags::CHAR_WORD | ParserFlags::WORD, /*  74 J */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /*  75 K */ ParserFlags::CHAR_WORD | ParserFlags::WORD, /*  76 L */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /*  77 M */ ParserFlags::CHAR_WORD | ParserFlags::WORD, /*  78 N */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /*  79 O */ ParserFlags::CHAR_WORD | ParserFlags::WORD, /*  80 P */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /*  81 Q */ ParserFlags::CHAR_WORD | ParserFlags::WORD, /*  82 R */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /*  83 S */ ParserFlags::CHAR_WORD | ParserFlags::WORD, /*  84 T */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /*  85 U */ ParserFlags::CHAR_WORD | ParserFlags::WORD, /*  86 V */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /*  87 W */ ParserFlags::CHAR_WORD | ParserFlags::WORD, /*  88 X */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /*  89 Y */ ParserFlags::CHAR_WORD | ParserFlags::WORD, /*  90 Z */ ParserFlags::CHAR_WORD | ParserFlags::WORD,

    /*  91 [  */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::ILLEGAL // UNUSED)
    /*  92 \  */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::ILLEGAL // UNUSED)
    /*  93 ]  */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::ILLEGAL // UNUSED)
    /*  94 ^  */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,
    /*  95 _  */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /*  96 `  */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::ILLEGAL // UNUSED)

    //for ( i = 97; i < 123; i++ )
    /*  97 a */ ParserFlags::CHAR_WORD | ParserFlags::WORD, /*  98 b */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /*  99 c */ ParserFlags::CHAR_WORD | ParserFlags::WORD, /* 100 d */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /* 101 e */ ParserFlags::CHAR_WORD | ParserFlags::WORD, /* 102 f */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /* 103 g */ ParserFlags::CHAR_WORD | ParserFlags::WORD, /* 104 h */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /* 105 i */ ParserFlags::CHAR_WORD | ParserFlags::WORD, /* 106 j */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /* 107 k */ ParserFlags::CHAR_WORD | ParserFlags::WORD, /* 108 l */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /* 109 m */ ParserFlags::CHAR_WORD | ParserFlags::WORD, /* 110 n */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /* 111 o */ ParserFlags::CHAR_WORD | ParserFlags::WORD, /* 112 p */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /* 113 q */ ParserFlags::CHAR_WORD | ParserFlags::WORD, /* 114 r */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /* 115 s */ ParserFlags::CHAR_WORD | ParserFlags::WORD, /* 116 t */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /* 117 u */ ParserFlags::CHAR_WORD | ParserFlags::WORD, /* 118 v */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /* 119 w */ ParserFlags::CHAR_WORD | ParserFlags::WORD, /* 120 x */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /* 121 y */ ParserFlags::CHAR_WORD | ParserFlags::WORD, /* 122 z */ ParserFlags::CHAR_WORD | ParserFlags::WORD,

    /* 123 {  */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::ILLEGAL // UNUSED)
    /* 124 |  */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::ILLEGAL // UNUSED)
    /* 125 }  */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::ILLEGAL // UNUSED)
    /* 126 ~  */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::ILLEGAL // UNUSED)
    /* 127    */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP  // (ParserFlags::ILLEGAL // UNUSED)
};
|
|
|
|
|
|
// KParseTokens::ASC_... classification for each of the first nDefCnt (128)
// code points, indexed by code point; read by getParseTokensType().
const sal_Int32 cclass_Unicode::pParseTokensType[ nDefCnt ] =
{
    /*   0 \0 */ KParseTokens::ASC_OTHER,
    /*   1- 4 */ KParseTokens::ASC_CONTROL, KParseTokens::ASC_CONTROL, KParseTokens::ASC_CONTROL, KParseTokens::ASC_CONTROL,
    /*   5- 8 */ KParseTokens::ASC_CONTROL, KParseTokens::ASC_CONTROL, KParseTokens::ASC_CONTROL, KParseTokens::ASC_CONTROL,
    /*   9 \t */ KParseTokens::ASC_CONTROL,
    /*  10    */ KParseTokens::ASC_CONTROL,
    /*  11 \v */ KParseTokens::ASC_CONTROL,
    /*  12-15 */ KParseTokens::ASC_CONTROL, KParseTokens::ASC_CONTROL, KParseTokens::ASC_CONTROL, KParseTokens::ASC_CONTROL,
    /*  16-19 */ KParseTokens::ASC_CONTROL, KParseTokens::ASC_CONTROL, KParseTokens::ASC_CONTROL, KParseTokens::ASC_CONTROL,
    /*  20-23 */ KParseTokens::ASC_CONTROL, KParseTokens::ASC_CONTROL, KParseTokens::ASC_CONTROL, KParseTokens::ASC_CONTROL,
    /*  24-27 */ KParseTokens::ASC_CONTROL, KParseTokens::ASC_CONTROL, KParseTokens::ASC_CONTROL, KParseTokens::ASC_CONTROL,
    /*  28-31 */ KParseTokens::ASC_CONTROL, KParseTokens::ASC_CONTROL, KParseTokens::ASC_CONTROL, KParseTokens::ASC_CONTROL,

    /*  32    */ KParseTokens::ASC_OTHER,
    /*  33 !  */ KParseTokens::ASC_OTHER,
    /*  34 "  */ KParseTokens::ASC_OTHER,
    /*  35 #  */ KParseTokens::ASC_OTHER,
    /*  36 $  */ KParseTokens::ASC_DOLLAR,
    /*  37 %  */ KParseTokens::ASC_OTHER,
    /*  38 &  */ KParseTokens::ASC_OTHER,
    /*  39 '  */ KParseTokens::ASC_OTHER,
    /*  40 (  */ KParseTokens::ASC_OTHER,
    /*  41 )  */ KParseTokens::ASC_OTHER,
    /*  42 *  */ KParseTokens::ASC_OTHER,
    /*  43 +  */ KParseTokens::ASC_OTHER,
    /*  44 ,  */ KParseTokens::ASC_OTHER,
    /*  45 -  */ KParseTokens::ASC_OTHER,
    /*  46 .  */ KParseTokens::ASC_DOT,
    /*  47 /  */ KParseTokens::ASC_OTHER,

    //for ( i = 48; i < 58; i++ )
    /*  48-52 0-4 */ KParseTokens::ASC_DIGIT, KParseTokens::ASC_DIGIT, KParseTokens::ASC_DIGIT, KParseTokens::ASC_DIGIT, KParseTokens::ASC_DIGIT,
    /*  53-57 5-9 */ KParseTokens::ASC_DIGIT, KParseTokens::ASC_DIGIT, KParseTokens::ASC_DIGIT, KParseTokens::ASC_DIGIT, KParseTokens::ASC_DIGIT,

    /*  58 :  */ KParseTokens::ASC_COLON,
    /*  59 ;  */ KParseTokens::ASC_OTHER,
    /*  60 <  */ KParseTokens::ASC_OTHER,
    /*  61 =  */ KParseTokens::ASC_OTHER,
    /*  62 >  */ KParseTokens::ASC_OTHER,
    /*  63 ?  */ KParseTokens::ASC_OTHER,
    /*  64 @  */ KParseTokens::ASC_OTHER,

    //for ( i = 65; i < 91; i++ )
    /*  65-68 A-D */ KParseTokens::ASC_UPALPHA, KParseTokens::ASC_UPALPHA, KParseTokens::ASC_UPALPHA, KParseTokens::ASC_UPALPHA,
    /*  69-72 E-H */ KParseTokens::ASC_UPALPHA, KParseTokens::ASC_UPALPHA, KParseTokens::ASC_UPALPHA, KParseTokens::ASC_UPALPHA,
    /*  73-76 I-L */ KParseTokens::ASC_UPALPHA, KParseTokens::ASC_UPALPHA, KParseTokens::ASC_UPALPHA, KParseTokens::ASC_UPALPHA,
    /*  77-80 M-P */ KParseTokens::ASC_UPALPHA, KParseTokens::ASC_UPALPHA, KParseTokens::ASC_UPALPHA, KParseTokens::ASC_UPALPHA,
    /*  81-84 Q-T */ KParseTokens::ASC_UPALPHA, KParseTokens::ASC_UPALPHA, KParseTokens::ASC_UPALPHA, KParseTokens::ASC_UPALPHA,
    /*  85-88 U-X */ KParseTokens::ASC_UPALPHA, KParseTokens::ASC_UPALPHA, KParseTokens::ASC_UPALPHA, KParseTokens::ASC_UPALPHA,
    /*  89-90 Y-Z */ KParseTokens::ASC_UPALPHA, KParseTokens::ASC_UPALPHA,

    /*  91 [  */ KParseTokens::ASC_OTHER,
    /*  92 \  */ KParseTokens::ASC_OTHER,
    /*  93 ]  */ KParseTokens::ASC_OTHER,
    /*  94 ^  */ KParseTokens::ASC_OTHER,
    /*  95 _  */ KParseTokens::ASC_UNDERSCORE,
    /*  96 `  */ KParseTokens::ASC_OTHER,

    //for ( i = 97; i < 123; i++ )
    /*  97-100 a-d */ KParseTokens::ASC_LOALPHA, KParseTokens::ASC_LOALPHA, KParseTokens::ASC_LOALPHA, KParseTokens::ASC_LOALPHA,
    /* 101-104 e-h */ KParseTokens::ASC_LOALPHA, KParseTokens::ASC_LOALPHA, KParseTokens::ASC_LOALPHA, KParseTokens::ASC_LOALPHA,
    /* 105-108 i-l */ KParseTokens::ASC_LOALPHA, KParseTokens::ASC_LOALPHA, KParseTokens::ASC_LOALPHA, KParseTokens::ASC_LOALPHA,
    /* 109-112 m-p */ KParseTokens::ASC_LOALPHA, KParseTokens::ASC_LOALPHA, KParseTokens::ASC_LOALPHA, KParseTokens::ASC_LOALPHA,
    /* 113-116 q-t */ KParseTokens::ASC_LOALPHA, KParseTokens::ASC_LOALPHA, KParseTokens::ASC_LOALPHA, KParseTokens::ASC_LOALPHA,
    /* 117-120 u-x */ KParseTokens::ASC_LOALPHA, KParseTokens::ASC_LOALPHA, KParseTokens::ASC_LOALPHA, KParseTokens::ASC_LOALPHA,
    /* 121-122 y-z */ KParseTokens::ASC_LOALPHA, KParseTokens::ASC_LOALPHA,

    /* 123 {  */ KParseTokens::ASC_OTHER,
    /* 124 |  */ KParseTokens::ASC_OTHER,
    /* 125 }  */ KParseTokens::ASC_OTHER,
    /* 126 ~  */ KParseTokens::ASC_OTHER,
    /* 127    */ KParseTokens::ASC_OTHER
};
|
|
|
|
|
|
// static
|
|
/** Find the first occurrence of code point c in a NUL-terminated UTF-16 string.

    The code point is split into one or two UTF-16 code units first, so a
    code point that needs a surrogate pair is matched as a pair.

    @param pStr  NUL-terminated UTF-16 string to search, may be nullptr.
    @param c     code point to look for.
    @return pointer to the (first code unit of the) match inside pStr, or
            nullptr if pStr is nullptr or contains no match.
 */
const sal_Unicode* cclass_Unicode::StrChr( const sal_Unicode* pStr, sal_uInt32 c )
{
    if ( pStr == nullptr )
        return nullptr;

    sal_Unicode aUnits[2];
    auto const nUnits = rtl::splitSurrogates(c, aUnits);
    for ( const sal_Unicode* pCur = pStr; *pCur; ++pCur )
    {
        if ( *pCur != aUnits[0] )
            continue;
        // The second unit is only compared when c needs a surrogate pair;
        // pCur[1] is at worst the terminating NUL, so the read is in bounds.
        if ( nUnits == 1 || pCur[1] == aUnits[1] )
            return pCur;
    }
    return nullptr;
}
|
|
|
|
|
|
/** Classify a code point as one of the KParseTokens constants.

    Code points below nDefCnt are looked up in pParseTokensType (ASC_...
    constants). All others are classified by their ICU general category and
    mapped to the matching UNI_... constant.

    @param c        code point to classify.
    @param isFirst  true if c is the first code point of the token; combining
                    marks are then not accepted as letters.
    @return the KParseTokens constant for c, KParseTokens::UNI_OTHER if no
            specific UNI_... constant applies.
 */
sal_Int32 cclass_Unicode::getParseTokensType(sal_uInt32 const c, bool const isFirst)
{
    if ( c < nDefCnt )
        return pParseTokensType[ sal_uInt8(c) ];

    //! all KParseTokens::UNI_... must be matched
    switch (u_charType(c))
    {
        case U_UPPERCASE_LETTER :
            return KParseTokens::UNI_UPALPHA;
        case U_LOWERCASE_LETTER :
            return KParseTokens::UNI_LOALPHA;
        case U_TITLECASE_LETTER :
            return KParseTokens::UNI_TITLE_ALPHA;
        case U_MODIFIER_LETTER :
            return KParseTokens::UNI_MODIFIER_LETTER;
        case U_NON_SPACING_MARK :
        case U_COMBINING_SPACING_MARK :
            // A combining mark can not be the leading character of a token
            // (same rule as in getFlagsExtended()); report it as UNI_OTHER
            // in that position.
            if (isFirst)
                break;
            [[fallthrough]]; // otherwise treat it as Other_Letter.
        case U_OTHER_LETTER :
            return KParseTokens::UNI_OTHER_LETTER;
        case U_DECIMAL_DIGIT_NUMBER :
            return KParseTokens::UNI_DIGIT;
        case U_LETTER_NUMBER :
            return KParseTokens::UNI_LETTER_NUMBER;
        case U_OTHER_NUMBER :
            return KParseTokens::UNI_OTHER_NUMBER;
    }

    return KParseTokens::UNI_OTHER;
}
|
|
|
|
void cclass_Unicode::setupInternational( const Locale& rLocale )
|
|
{
|
|
bool bChanged = (aParserLocale.Language != rLocale.Language
|
|
|| aParserLocale.Country != rLocale.Country
|
|
|| aParserLocale.Variant != rLocale.Variant);
|
|
if ( bChanged )
|
|
{
|
|
aParserLocale.Language = rLocale.Language;
|
|
aParserLocale.Country = rLocale.Country;
|
|
aParserLocale.Variant = rLocale.Variant;
|
|
}
|
|
if ( !mxLocaleData.is() )
|
|
{
|
|
mxLocaleData.set( LocaleData2::create(m_xContext) );
|
|
}
|
|
}
|
|
|
|
|
|
void cclass_Unicode::setupParserTable( const Locale& rLocale, sal_Int32 startCharTokenType,
|
|
const OUString& userDefinedCharactersStart, sal_Int32 contCharTokenType,
|
|
const OUString& userDefinedCharactersCont )
|
|
{
|
|
bool bIntlEqual = (rLocale.Language == aParserLocale.Language &&
|
|
rLocale.Country == aParserLocale.Country &&
|
|
rLocale.Variant == aParserLocale.Variant);
|
|
if ( !pTable || !bIntlEqual ||
|
|
startCharTokenType != nStartTypes ||
|
|
contCharTokenType != nContTypes ||
|
|
userDefinedCharactersStart != aStartChars ||
|
|
userDefinedCharactersCont != aContChars )
|
|
initParserTable( rLocale, startCharTokenType, userDefinedCharactersStart,
|
|
contCharTokenType, userDefinedCharactersCont );
|
|
}
|
|
|
|
|
|
void cclass_Unicode::initParserTable( const Locale& rLocale, sal_Int32 startCharTokenType,
|
|
const OUString& userDefinedCharactersStart, sal_Int32 contCharTokenType,
|
|
const OUString& userDefinedCharactersCont )
|
|
{
|
|
// (Re)Init
|
|
setupInternational( rLocale );
|
|
// Memory of pTable is reused.
|
|
if ( !pTable )
|
|
pTable.reset(new ParserFlags[nDefCnt]);
|
|
memcpy( pTable.get(), pDefaultParserTable, sizeof(ParserFlags) * nDefCnt );
|
|
// Start and cont tables only need reallocation if different length.
|
|
if ( pStart && userDefinedCharactersStart.getLength() != aStartChars.getLength() )
|
|
{
|
|
pStart.reset();
|
|
}
|
|
if ( pCont && userDefinedCharactersCont.getLength() != aContChars.getLength() )
|
|
{
|
|
pCont.reset();
|
|
}
|
|
nStartTypes = startCharTokenType;
|
|
nContTypes = contCharTokenType;
|
|
aStartChars = userDefinedCharactersStart;
|
|
aContChars = userDefinedCharactersCont;
|
|
|
|
// specials
|
|
if( mxLocaleData.is() )
|
|
{
|
|
LocaleDataItem2 aItem =
|
|
mxLocaleData->getLocaleItem2( aParserLocale );
|
|
//!TODO: theoretically separators may be a string, adjustment would have to be
|
|
//! done here and in parsing and in ::rtl::math::stringToDouble()
|
|
cGroupSep = aItem.thousandSeparator[0];
|
|
cDecimalSep = aItem.decimalSeparator[0];
|
|
cDecimalSepAlt = aItem.decimalSeparatorAlternative.toChar();
|
|
}
|
|
|
|
if (nContTypes & KParseTokens::GROUP_SEPARATOR_IN_NUMBER)
|
|
{
|
|
if ( cGroupSep < nDefCnt )
|
|
pTable[cGroupSep] |= ParserFlags::VALUE;
|
|
}
|
|
else
|
|
{
|
|
cGroupSep = 0;
|
|
}
|
|
if ( cDecimalSep < nDefCnt )
|
|
pTable[cDecimalSep] |= ParserFlags::CHAR_VALUE | ParserFlags::VALUE;
|
|
if ( cDecimalSepAlt && cDecimalSepAlt < nDefCnt )
|
|
pTable[cDecimalSepAlt] |= ParserFlags::CHAR_VALUE | ParserFlags::VALUE;
|
|
|
|
// Modify characters according to KParseTokens definitions.
|
|
{
|
|
using namespace KParseTokens;
|
|
sal_uInt8 i;
|
|
|
|
if ( !(nStartTypes & ASC_UPALPHA) )
|
|
for ( i = 65; i < 91; i++ )
|
|
pTable[i] &= ~ParserFlags::CHAR_WORD; // not allowed as start character
|
|
if ( !(nContTypes & ASC_UPALPHA) )
|
|
for ( i = 65; i < 91; i++ )
|
|
pTable[i] &= ~ParserFlags::WORD; // not allowed as cont character
|
|
|
|
if ( !(nStartTypes & ASC_LOALPHA) )
|
|
for ( i = 97; i < 123; i++ )
|
|
pTable[i] &= ~ParserFlags::CHAR_WORD; // not allowed as start character
|
|
if ( !(nContTypes & ASC_LOALPHA) )
|
|
for ( i = 97; i < 123; i++ )
|
|
pTable[i] &= ~ParserFlags::WORD; // not allowed as cont character
|
|
|
|
if ( nStartTypes & ASC_DIGIT )
|
|
for ( i = 48; i < 58; i++ )
|
|
pTable[i] |= ParserFlags::CHAR_WORD; // allowed as start character
|
|
if ( !(nContTypes & ASC_DIGIT) )
|
|
for ( i = 48; i < 58; i++ )
|
|
pTable[i] &= ~ParserFlags::WORD; // not allowed as cont character
|
|
|
|
if ( !(nStartTypes & ASC_UNDERSCORE) )
|
|
pTable[95] &= ~ParserFlags::CHAR_WORD; // not allowed as start character
|
|
if ( !(nContTypes & ASC_UNDERSCORE) )
|
|
pTable[95] &= ~ParserFlags::WORD; // not allowed as cont character
|
|
|
|
if ( nStartTypes & ASC_DOLLAR )
|
|
pTable[36] |= ParserFlags::CHAR_WORD; // allowed as start character
|
|
if ( nContTypes & ASC_DOLLAR )
|
|
pTable[36] |= ParserFlags::WORD; // allowed as cont character
|
|
|
|
if ( nStartTypes & ASC_DOT )
|
|
pTable[46] |= ParserFlags::CHAR_WORD; // allowed as start character
|
|
if ( nContTypes & ASC_DOT )
|
|
pTable[46] |= ParserFlags::WORD; // allowed as cont character
|
|
|
|
if ( nStartTypes & ASC_COLON )
|
|
pTable[58] |= ParserFlags::CHAR_WORD; // allowed as start character
|
|
if ( nContTypes & ASC_COLON )
|
|
pTable[58] |= ParserFlags::WORD; // allowed as cont character
|
|
|
|
if ( nStartTypes & ASC_CONTROL )
|
|
for ( i = 1; i < 32; i++ )
|
|
pTable[i] |= ParserFlags::CHAR_WORD; // allowed as start character
|
|
if ( nContTypes & ASC_CONTROL )
|
|
for ( i = 1; i < 32; i++ )
|
|
pTable[i] |= ParserFlags::WORD; // allowed as cont character
|
|
|
|
if ( nStartTypes & ASC_ANY_BUT_CONTROL )
|
|
for ( i = 32; i < nDefCnt; i++ )
|
|
pTable[i] |= ParserFlags::CHAR_WORD; // allowed as start character
|
|
if ( nContTypes & ASC_ANY_BUT_CONTROL )
|
|
for ( i = 32; i < nDefCnt; i++ )
|
|
pTable[i] |= ParserFlags::WORD; // allowed as cont character
|
|
|
|
}
|
|
|
|
// Merge in (positively override with) user defined characters.
|
|
// StartChars
|
|
sal_Int32 nLen = aStartChars.getLength();
|
|
if ( nLen )
|
|
{
|
|
if ( !pStart )
|
|
pStart.reset(new ParserFlags[ nLen ]);
|
|
const sal_Unicode* p = aStartChars.getStr();
|
|
for ( sal_Int32 j=0; j<nLen; j++, p++ )
|
|
{
|
|
pStart[j] = ParserFlags::CHAR_WORD;
|
|
if ( *p < nDefCnt )
|
|
pTable[*p] |= ParserFlags::CHAR_WORD;
|
|
}
|
|
}
|
|
// ContChars
|
|
nLen = aContChars.getLength();
|
|
if ( nLen )
|
|
{
|
|
if ( !pCont )
|
|
pCont.reset(new ParserFlags[ nLen ]);
|
|
const sal_Unicode* p = aContChars.getStr();
|
|
for ( sal_Int32 j=0; j<nLen; j++ )
|
|
{
|
|
pCont[j] = ParserFlags::WORD;
|
|
if ( *p < nDefCnt )
|
|
pTable[*p] |= ParserFlags::WORD;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
void cclass_Unicode::destroyParserTable()
|
|
{
|
|
pCont.reset();
|
|
pStart.reset();
|
|
pTable.reset();
|
|
}
|
|
|
|
|
|
/** Get the ParserFlags of a code point for the given scanner state.

    Code points below nDefCnt are read from pTable, all others are classified
    by getFlagsExtended(). If the result lacks the word flag relevant for the
    state (CHAR_WORD in start states, WORD in ssGetValue / ssGetWord), the
    user defined start / continuation characters are consulted; if they
    supply that flag, EXCLUDED is cleared.

    @param c       code point to look up.
    @param eState  current scanner state.
    @return the combined flags for c.
 */
ParserFlags cclass_Unicode::getFlags(sal_uInt32 const c, const cclass_Unicode::ScanState eState)
{
    ParserFlags nFlags = (c < nDefCnt) ? pTable[ sal_uInt8(c) ] : getFlagsExtended(c, eState);

    const bool bStartState = eState == ssGetChar || eState == ssRewindFromValue
        || eState == ssIgnoreLeadingInRewind || eState == ssGetWordFirstChar;
    const bool bContState = eState == ssGetValue || eState == ssGetWord;

    if ( bStartState )
    {
        if ( !(nFlags & ParserFlags::CHAR_WORD) )
        {
            nFlags |= getStartCharsFlags( c );
            if ( nFlags & ParserFlags::CHAR_WORD )
                nFlags &= ~ParserFlags::EXCLUDED;
        }
    }
    else if ( bContState )
    {
        if ( !(nFlags & ParserFlags::WORD) )
        {
            nFlags |= getContCharsFlags( c );
            if ( nFlags & ParserFlags::WORD )
                nFlags &= ~ParserFlags::EXCLUDED;
        }
    }
    // All other states use the table / extended flags unmodified.

    return nFlags;
}
|
|
|
|
|
|
/** Compute the ParserFlags of a code point from its Unicode properties.

    getFlags() calls this for code points that are not covered by pTable.
    The locale's group and decimal separators are recognized first. Any
    other code point is classified by its ICU general category and accepted
    as a word character only if the matching KParseTokens::UNI_... bit is
    set in nStartTypes (start states) or nContTypes (all other states).

    @param c       code point to classify.
    @param eState  current scanner state; selects start or continuation rules.
    @return the flags for c, ParserFlags::ILLEGAL if c is not acceptable.
 */
ParserFlags cclass_Unicode::getFlagsExtended(sal_uInt32 const c, const cclass_Unicode::ScanState eState) const
{
    if ( c == cGroupSep )
        return ParserFlags::VALUE;
    if ( c == cDecimalSep )
        return ParserFlags::CHAR_VALUE | ParserFlags::VALUE;
    if ( cDecimalSepAlt && c == cDecimalSepAlt )
        return ParserFlags::CHAR_VALUE | ParserFlags::VALUE;

    bool const bStart = (eState == ssGetChar || eState == ssGetWordFirstChar ||
            eState == ssRewindFromValue || eState == ssIgnoreLeadingInRewind);
    sal_Int32 const nTypes = (bStart ? nStartTypes : nContTypes);

    // Word flag for the current position if nTypeBit is enabled, else ILLEGAL.
    auto const wordFlagIf = [bStart, nTypes](sal_Int32 nTypeBit) -> ParserFlags
    {
        if ( !(nTypes & nTypeBit) )
            return ParserFlags::ILLEGAL;
        return bStart ? ParserFlags::CHAR_WORD : ParserFlags::WORD;
    };

    //! all KParseTokens::UNI_... must be matched
    switch (u_charType(c))
    {
        case U_UPPERCASE_LETTER :
            return wordFlagIf(KParseTokens::UNI_UPALPHA);
        case U_LOWERCASE_LETTER :
            return wordFlagIf(KParseTokens::UNI_LOALPHA);
        case U_TITLECASE_LETTER :
            return wordFlagIf(KParseTokens::UNI_TITLE_ALPHA);
        case U_MODIFIER_LETTER :
            return wordFlagIf(KParseTokens::UNI_MODIFIER_LETTER);
        case U_NON_SPACING_MARK :
        case U_COMBINING_SPACING_MARK :
            // Neither a non-spacing mark nor a spacing combining mark may
            // be the leading character.
            if (bStart)
                return ParserFlags::ILLEGAL;
            [[fallthrough]]; // otherwise handled like Other_Letter.
        case U_OTHER_LETTER :
            return wordFlagIf(KParseTokens::UNI_OTHER_LETTER);
        case U_DECIMAL_DIGIT_NUMBER :
            return wordFlagIf(KParseTokens::UNI_DIGIT) | TOKEN_DIGIT_FLAGS;
        case U_LETTER_NUMBER :
            return wordFlagIf(KParseTokens::UNI_LETTER_NUMBER) | TOKEN_DIGIT_FLAGS;
        case U_OTHER_NUMBER :
            return wordFlagIf(KParseTokens::UNI_OTHER_NUMBER) | TOKEN_DIGIT_FLAGS;
        case U_SPACE_SEPARATOR :
            if (nTypes & KParseTokens::IGNORE_LEADING_WS)
                return ParserFlags::CHAR_DONTCARE;
            if (bStart)
                return ParserFlags::CHAR_WORD;
            return ParserFlags::CHAR_DONTCARE | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP;
        case U_OTHER_PUNCTUATION:
        {
            // fdo#61754 If not at the start, mid-letter punctuation such as
            // U+00B7 MIDDLE DOT may continue a word, similar to
            // U_NON_SPACING_MARK.
            // tdf#123575 U+30FB KATAKANA MIDDLE DOT has the word break
            // property U_WB_KATAKANA instead of U_WB_MIDLETTER, so it and
            // U+FF65 HALFWIDTH KATAKANA MIDDLE DOT are tested explicitly.
            bool const bMidLetter = !bStart
                && (U_WB_MIDLETTER == u_getIntPropertyValue(c, UCHAR_WORD_BREAK)
                    || c == 0x30FB || c == 0xFF65);
            if (!bMidLetter)
                return ParserFlags::ILLEGAL;
            // allowing it to continue the word
            if (nTypes & KParseTokens::UNI_OTHER_LETTER)
                return ParserFlags::WORD;
            return ParserFlags::ILLEGAL;
        }
    }

    return ParserFlags::ILLEGAL;
}
|
|
|
|
|
|
/** Look up c among the user defined start characters.

    @param c  code point to look up.
    @return the flags stored in pStart for c, or ParserFlags::ILLEGAL if
            there is no pStart array or c is not in aStartChars.
 */
ParserFlags cclass_Unicode::getStartCharsFlags( sal_uInt32 c )
{
    if ( !pStart )
        return ParserFlags::ILLEGAL;

    const sal_Unicode* const pBegin = aStartChars.getStr();
    const sal_Unicode* const pHit = StrChr( pBegin, c );
    if ( !pHit )
        return ParserFlags::ILLEGAL;
    // pStart is indexed by the position within aStartChars.
    return pStart[ pHit - pBegin ];
}
|
|
|
|
|
|
/** Look up c among the user defined continuation characters.

    @param c  UTF-16 code unit to look up.
    @return the flags stored in pCont for c, or ParserFlags::ILLEGAL if
            there is no pCont array or c is not in aContChars.
 */
// NOTE(review): the parameter is sal_Unicode while getFlags() passes a
// sal_uInt32 code point and getStartCharsFlags() takes sal_uInt32; a code
// point above 0xFFFF would presumably be truncated here - confirm against
// the declaration in the header.
ParserFlags cclass_Unicode::getContCharsFlags( sal_Unicode c )
{
    if ( !pCont )
        return ParserFlags::ILLEGAL;

    const sal_Unicode* const pBegin = aContChars.getStr();
    const sal_Unicode* const pHit = StrChr( pBegin, c );
    if ( !pHit )
        return ParserFlags::ILLEGAL;
    // pCont is indexed by the position within aContChars.
    return pCont[ pHit - pBegin ];
}
|
|
|
|
|
|
void cclass_Unicode::parseText( ParseResult& r, const OUString& rText, sal_Int32 nPos, sal_Int32 nTokenType )
|
|
{
|
|
assert(r.LeadingWhiteSpace == 0);
|
|
ScanState eState = ssGetChar;
|
|
|
|
//! All the variables below (plus ParseResult) have to be reset on ssRewindFromValue!
|
|
OUStringBuffer aSymbol;
|
|
bool isFirst(true);
|
|
sal_Int32 index(nPos); // index of next code point after current
|
|
sal_Int32 postSymbolIndex(index); // index of code point following last quote
|
|
sal_uInt32 current((index < rText.getLength()) ? rText.iterateCodePoints(&index) : 0);
|
|
sal_uInt32 cLast = 0;
|
|
sal_Int32 nCodePoints(0);
|
|
int nDecSeps = 0;
|
|
bool bQuote = false;
|
|
bool bMightBeWord = true;
|
|
bool bMightBeWordLast = true;
|
|
bool bDecSepAltUsed = false;
|
|
//! All the variables above (plus ParseResult) have to be reset on ssRewindFromValue!
|
|
sal_Int32 nextCharIndex(nPos); // == index of nextChar
|
|
|
|
while ((current != 0) && (eState != ssStop))
|
|
{
|
|
++nCodePoints;
|
|
ParserFlags nMask = getFlags(current, eState);
|
|
if ( nMask & ParserFlags::EXCLUDED )
|
|
eState = ssBounce;
|
|
if ( bMightBeWord )
|
|
{ // only relevant for ssGetValue fall back
|
|
if ( eState == ssGetChar || eState == ssRewindFromValue ||
|
|
eState == ssIgnoreLeadingInRewind )
|
|
bMightBeWord = bool(nMask & ParserFlags::CHAR_WORD);
|
|
else
|
|
bMightBeWord = bool(nMask & ParserFlags::WORD);
|
|
}
|
|
sal_Int32 nParseTokensType = getParseTokensType(current, isFirst);
|
|
isFirst = false;
|
|
sal_Int32 const nextIndex(nextCharIndex); // == index of char following current
|
|
nextCharIndex = index; // == index of nextChar
|
|
sal_uInt32 nextChar((index < rText.getLength()) ? rText.iterateCodePoints(&index) : 0);
|
|
switch (eState)
|
|
{
|
|
case ssGetChar :
|
|
case ssRewindFromValue :
|
|
case ssIgnoreLeadingInRewind :
|
|
{
|
|
if ( (nMask & ParserFlags::CHAR_VALUE) && eState != ssRewindFromValue
|
|
&& eState != ssIgnoreLeadingInRewind )
|
|
{ //! must be first, may fall back to ssGetWord via bMightBeWord
|
|
eState = ssGetValue;
|
|
if ( nMask & ParserFlags::VALUE_DIGIT )
|
|
{
|
|
if (128 <= current)
|
|
r.TokenType = KParseType::UNI_NUMBER;
|
|
else
|
|
r.TokenType = KParseType::ASC_NUMBER;
|
|
}
|
|
else if (current == cDecimalSep || (bDecSepAltUsed = (cDecimalSepAlt && current == cDecimalSepAlt)))
|
|
{
|
|
if (nextChar)
|
|
++nDecSeps;
|
|
else
|
|
eState = ssRewindFromValue;
|
|
// retry for ONE_SINGLE_CHAR or others
|
|
}
|
|
}
|
|
else if ( nMask & ParserFlags::CHAR_WORD )
|
|
{
|
|
eState = ssGetWord;
|
|
r.TokenType = KParseType::IDENTNAME;
|
|
}
|
|
else if ( nMask & ParserFlags::NAME_SEP )
|
|
{
|
|
eState = ssGetWordFirstChar;
|
|
bQuote = true;
|
|
postSymbolIndex = nextCharIndex;
|
|
nParseTokensType = 0; // will be taken of first real character
|
|
r.TokenType = KParseType::SINGLE_QUOTE_NAME;
|
|
}
|
|
else if ( nMask & ParserFlags::CHAR_STRING )
|
|
{
|
|
eState = ssGetString;
|
|
postSymbolIndex = nextCharIndex;
|
|
nParseTokensType = 0; // will be taken of first real character
|
|
r.TokenType = KParseType::DOUBLE_QUOTE_STRING;
|
|
}
|
|
else if ( nMask & ParserFlags::CHAR_DONTCARE )
|
|
{
|
|
if ( nStartTypes & KParseTokens::IGNORE_LEADING_WS )
|
|
{
|
|
if (eState == ssRewindFromValue)
|
|
eState = ssIgnoreLeadingInRewind;
|
|
r.LeadingWhiteSpace = nextCharIndex - nPos;
|
|
nCodePoints--; // exclude leading whitespace
|
|
postSymbolIndex = nextCharIndex;
|
|
nParseTokensType = 0; // wait until real character
|
|
bMightBeWord = true;
|
|
}
|
|
else
|
|
eState = ssBounce;
|
|
}
|
|
else if ( nMask & ParserFlags::CHAR_BOOL )
|
|
{
|
|
eState = ssGetBool;
|
|
r.TokenType = KParseType::BOOLEAN;
|
|
}
|
|
else if ( nMask & ParserFlags::CHAR )
|
|
{ //! must be last
|
|
eState = ssStop;
|
|
r.TokenType = KParseType::ONE_SINGLE_CHAR;
|
|
}
|
|
else
|
|
eState = ssBounce; // not known
|
|
}
|
|
break;
|
|
case ssGetValue :
|
|
{
|
|
if ( nMask & ParserFlags::VALUE_DIGIT )
|
|
{
|
|
if (128 <= current)
|
|
r.TokenType = KParseType::UNI_NUMBER;
|
|
else if ( r.TokenType != KParseType::UNI_NUMBER )
|
|
r.TokenType = KParseType::ASC_NUMBER;
|
|
}
|
|
if ( nMask & ParserFlags::VALUE )
|
|
{
|
|
if (current == cGroupSep)
|
|
{
|
|
// accept only if it is followed by 3 digits
|
|
sal_Int32 tempIndex(index);
|
|
sal_uInt32 const nextChar2((tempIndex < rText.getLength()) ? rText.iterateCodePoints(&tempIndex) : 0);
|
|
sal_uInt32 const nextChar3((tempIndex < rText.getLength()) ? rText.iterateCodePoints(&tempIndex) : 0);
|
|
if (getFlags(nextChar, eState) & ParserFlags::VALUE_DIGIT
|
|
&& getFlags(nextChar2, eState) & ParserFlags::VALUE_DIGIT
|
|
&& getFlags(nextChar3, eState) & ParserFlags::VALUE_DIGIT)
|
|
{
|
|
nParseTokensType |= KParseTokens::GROUP_SEPARATOR_IN_NUMBER;
|
|
}
|
|
else
|
|
{
|
|
// Trailing group separator character is not a
|
|
// group separator.
|
|
eState = ssStopBack;
|
|
}
|
|
}
|
|
else if ((current == cDecimalSep ||
|
|
(bDecSepAltUsed = (cDecimalSepAlt && current == cDecimalSepAlt))) &&
|
|
++nDecSeps > 1)
|
|
{
|
|
if (nCodePoints == 2)
|
|
eState = ssRewindFromValue;
|
|
// consecutive separators
|
|
else
|
|
eState = ssStopBack;
|
|
}
|
|
// else keep it going
|
|
}
|
|
else if (current == 'E' || current == 'e')
|
|
{
|
|
ParserFlags nNext = getFlags(nextChar, eState);
|
|
if ( nNext & ParserFlags::VALUE_EXP )
|
|
; // keep it going
|
|
else if (bMightBeWord && ((nNext & ParserFlags::WORD) || !nextChar))
|
|
{ // might be a numerical name (1.2efg)
|
|
eState = ssGetWord;
|
|
r.TokenType = KParseType::IDENTNAME;
|
|
}
|
|
else
|
|
eState = ssStopBack;
|
|
}
|
|
else if ( nMask & ParserFlags::VALUE_SIGN )
|
|
{
|
|
if ( (cLast == 'E') || (cLast == 'e') )
|
|
{
|
|
ParserFlags nNext = getFlags(nextChar, eState);
|
|
if ( nNext & ParserFlags::VALUE_EXP_VALUE )
|
|
; // keep it going
|
|
else if (bMightBeWord && ((nNext & ParserFlags::WORD) || !nextChar))
|
|
{ // might be a numerical name (1.2e+fg)
|
|
eState = ssGetWord;
|
|
r.TokenType = KParseType::IDENTNAME;
|
|
}
|
|
else
|
|
eState = ssStopBack;
|
|
}
|
|
else if ( bMightBeWord )
|
|
{ // might be a numerical name (1.2+fg)
|
|
eState = ssGetWord;
|
|
r.TokenType = KParseType::IDENTNAME;
|
|
}
|
|
else
|
|
eState = ssStopBack;
|
|
}
|
|
else if ( bMightBeWord && (nMask & ParserFlags::WORD) )
|
|
{ // might be a numerical name (1995.A1)
|
|
eState = ssGetWord;
|
|
r.TokenType = KParseType::IDENTNAME;
|
|
}
|
|
else
|
|
eState = ssStopBack;
|
|
}
|
|
break;
|
|
case ssGetWordFirstChar :
|
|
eState = ssGetWord;
|
|
[[fallthrough]];
|
|
case ssGetWord :
|
|
{
|
|
if ( nMask & ParserFlags::WORD )
|
|
; // keep it going
|
|
else if ( nMask & ParserFlags::NAME_SEP )
|
|
{
|
|
if ( bQuote )
|
|
{
|
|
if ( cLast == '\\' )
|
|
{ // escaped
|
|
aSymbol.append(
|
|
OUString::Concat(rText.subView(postSymbolIndex, nextCharIndex - postSymbolIndex - 2))
|
|
+ OUString(&current, 1));
|
|
}
|
|
else
|
|
{
|
|
eState = ssStop;
|
|
aSymbol.append(rText.subView(postSymbolIndex, nextCharIndex - postSymbolIndex - 1));
|
|
}
|
|
postSymbolIndex = nextCharIndex;
|
|
}
|
|
else
|
|
eState = ssStopBack;
|
|
}
|
|
else if ( bQuote )
|
|
; // keep it going
|
|
else
|
|
eState = ssStopBack;
|
|
}
|
|
break;
|
|
case ssGetString :
|
|
{
|
|
if ( nMask & ParserFlags::STRING_SEP )
|
|
{
|
|
if ( cLast == '\\' )
|
|
{ // escaped
|
|
aSymbol.append(
|
|
rText.subView(postSymbolIndex, nextCharIndex - postSymbolIndex - 2)
|
|
+ OUString(&current, 1));
|
|
}
|
|
else if (current == nextChar &&
|
|
!(nContTypes & KParseTokens::TWO_DOUBLE_QUOTES_BREAK_STRING) )
|
|
{ // "" => literal " escaped
|
|
aSymbol.append(rText.subView(postSymbolIndex, nextCharIndex - postSymbolIndex));
|
|
nextCharIndex = index;
|
|
if (index < rText.getLength()) { ++nCodePoints; }
|
|
nextChar = (index < rText.getLength()) ? rText.iterateCodePoints(&index) : 0;
|
|
}
|
|
else
|
|
{
|
|
eState = ssStop;
|
|
aSymbol.append(rText.subView(postSymbolIndex, nextCharIndex - postSymbolIndex - 1));
|
|
}
|
|
postSymbolIndex = nextCharIndex;
|
|
}
|
|
}
|
|
break;
|
|
case ssGetBool :
|
|
{
|
|
if ( nMask & ParserFlags::BOOL )
|
|
eState = ssStop; // maximum 2: <, >, <>, <=, >=
|
|
else
|
|
eState = ssStopBack;
|
|
}
|
|
break;
|
|
case ssStopBack :
|
|
case ssBounce :
|
|
case ssStop :
|
|
; // nothing, no compiler warning
|
|
break;
|
|
}
|
|
if ( eState == ssRewindFromValue )
|
|
{
|
|
r = ParseResult();
|
|
index = nPos;
|
|
postSymbolIndex = nPos;
|
|
nextCharIndex = nPos;
|
|
aSymbol.setLength(0);
|
|
current = (index < rText.getLength()) ? rText.iterateCodePoints(&index) : 0;
|
|
nCodePoints = (nPos < rText.getLength()) ? 1 : 0;
|
|
isFirst = true;
|
|
cLast = 0;
|
|
nDecSeps = 0;
|
|
bQuote = false;
|
|
bMightBeWord = true;
|
|
bMightBeWordLast = true;
|
|
bDecSepAltUsed = false;
|
|
}
|
|
else
|
|
{
|
|
if ( !(r.TokenType & nTokenType) )
|
|
{
|
|
if ( (r.TokenType & (KParseType::ASC_NUMBER | KParseType::UNI_NUMBER))
|
|
&& (nTokenType & KParseType::IDENTNAME) && bMightBeWord )
|
|
; // keep a number that might be a word
|
|
else if (r.LeadingWhiteSpace == (nextCharIndex - nPos))
|
|
; // keep ignored white space
|
|
else if ( !r.TokenType && eState == ssGetValue && (nMask & ParserFlags::VALUE_SEP) )
|
|
; // keep uncertain value
|
|
else
|
|
eState = ssBounce;
|
|
}
|
|
if ( eState == ssBounce )
|
|
{
|
|
r.TokenType = 0;
|
|
eState = ssStopBack;
|
|
}
|
|
if ( eState == ssStopBack )
|
|
{ // put back
|
|
nextChar = rText.iterateCodePoints(&index, -1);
|
|
nextCharIndex = nextIndex;
|
|
--nCodePoints;
|
|
bMightBeWord = bMightBeWordLast;
|
|
eState = ssStop;
|
|
}
|
|
if ( eState != ssStop )
|
|
{
|
|
if ( !r.StartFlags )
|
|
r.StartFlags |= nParseTokensType;
|
|
else
|
|
r.ContFlags |= nParseTokensType;
|
|
}
|
|
bMightBeWordLast = bMightBeWord;
|
|
cLast = current;
|
|
current = nextChar;
|
|
}
|
|
}
|
|
// r.CharLen is the length in characters (not code units) of the parsed
|
|
// token not including any leading white space.
|
|
r.CharLen = nCodePoints;
|
|
r.EndPos = nextCharIndex;
|
|
if ( r.TokenType & KParseType::ASC_NUMBER )
|
|
{
|
|
r.Value = rtl_math_uStringToDouble(rText.getStr() + nPos + r.LeadingWhiteSpace,
|
|
rText.getStr() + r.EndPos, (bDecSepAltUsed ? cDecimalSepAlt : cDecimalSep), cGroupSep, nullptr, nullptr);
|
|
if ( bMightBeWord )
|
|
r.TokenType |= KParseType::IDENTNAME;
|
|
}
|
|
else if ( r.TokenType & KParseType::UNI_NUMBER )
|
|
{
|
|
if ( !xNatNumSup.is() )
|
|
{
|
|
if ( m_xContext.is() )
|
|
{
|
|
xNatNumSup = NativeNumberSupplier::create( m_xContext );
|
|
}
|
|
}
|
|
OUString aTmp(rText.getStr() + nPos + r.LeadingWhiteSpace,
|
|
r.EndPos - nPos - r.LeadingWhiteSpace);
|
|
// transliterate to ASCII
|
|
aTmp = xNatNumSup->getNativeNumberString( aTmp, aParserLocale,
|
|
NativeNumberMode::NATNUM0 );
|
|
r.Value = ::rtl::math::stringToDouble( aTmp, cDecimalSep, cGroupSep );
|
|
if ( bMightBeWord )
|
|
r.TokenType |= KParseType::IDENTNAME;
|
|
}
|
|
else if ( r.TokenType & (KParseType::SINGLE_QUOTE_NAME | KParseType::DOUBLE_QUOTE_STRING) )
|
|
{
|
|
if (postSymbolIndex < nextCharIndex)
|
|
{ //! open quote
|
|
aSymbol.append(rText.subView(postSymbolIndex, nextCharIndex - postSymbolIndex - 1));
|
|
r.TokenType |= KParseType::MISSING_QUOTE;
|
|
}
|
|
r.DequotedNameOrString = aSymbol.makeStringAndClear();
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|