office-gobmx/svtools/source/svrtf/parrtf.cxx

/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
 * This file is part of the LibreOffice project.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * This file incorporates work covered by the following license notice:
 *
 *   Licensed to the Apache Software Foundation (ASF) under one or more
 *   contributor license agreements. See the NOTICE file distributed
 *   with this work for additional information regarding copyright
 *   ownership. The ASF licenses this file to you under the Apache
 *   License, Version 2.0 (the "License"); you may not use this file
 *   except in compliance with the License. You may obtain a copy of
 *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
 */

#include <sal/config.h>
#include <sal/log.hxx>

#include <comphelper/scopeguard.hxx>

#include <rtl/character.hxx>
#include <rtl/strbuf.hxx>
#include <rtl/tencinfo.h>
#include <rtl/ustrbuf.hxx>
#include <tools/stream.hxx>
#include <tools/debug.hxx>
#include <svtools/rtftoken.h>
#include <svtools/parrtf.hxx>

// Upper bound for the length of the text buffer collected by ScanText()
// before the text is handed out as a token.
constexpr int MAX_STRING_LEN = 1024;
// Initial capacity of the buffer used to collect a control word.
constexpr int MAX_TOKEN_LEN = 128;

// RTF control words consist of ASCII letters, their parameters of ASCII
// digits, hence the ASCII-only classification helpers.
#define RTF_ISDIGIT( c ) rtl::isAsciiDigit(c)
#define RTF_ISALPHA( c ) rtl::isAsciiAlpha(c)

// Sets up a parser reading from rIn; nStackSize is forwarded unchanged to the
// SvParser<int> base class.
SvRTFParser::SvRTFParser( SvStream& rIn, sal_uInt8 nStackSize )
    : SvParser<int>( rIn, nStackSize )
    , nOpenBrackets( 0 )
    , eCodeSet( RTL_TEXTENCODING_MS_1252 )
    , nUCharOverread( 1 )
{
    // the ANSI code set (MS-1252, stored in eCodeSet above) is the default
    // source encoding
    SetSrcEncoding( eCodeSet );
    // no \u handling from within ScanText() is in progress
    bRTF_InTextRead = false;
}

// Nothing to release explicitly; the members clean up after themselves.
SvRTFParser::~SvRTFParser() = default;


// Tokenizer: produces the next RTF token, starting at the look-ahead
// character nNextCh.
//
// The loop repeats until a non-zero token was found or eState is no longer
// SvParserState::Working. Side effects visible in this function:
//  - aToken receives the control word (including the leading backslash) or,
//    via ScanText(), the collected text
//  - nTokenValue / bTokenHasValue are set when a control word carries a
//    numeric parameter
//  - '{' and '}' update nOpenBrackets and the aParserStates stack
//
// Returns '{', '}' or the EOF character as-is, RTF_TEXTTOKEN for text, or the
// id found by GetRTFToken() (RTF_UNKNOWNCONTROL if that returned 0). The
// result is 0 only when the loop was left because the parser stopped working.
int SvRTFParser::GetNextToken_()
{
    int nRet = 0;
    do {
        // whether a fresh look-ahead character has to be fetched at the end
        // of this iteration; branches that already advanced nNextCh clear it
        bool bNextCh = true;
        switch( nNextCh )
        {
        case '\\':
            {
                // control characters
                nNextCh = GetNextChar();
                switch( nNextCh )
                {
                case '{':
                case '}':
                case '\\':
                case '+':       // I found it in a RTF-file
                case '~':       // nonbreaking space
                case '-':       // optional hyphen
                case '_':       // nonbreaking hyphen
                case '\'':      // HexValue
                    // These escapes are text: step back one position and
                    // restore the backslash as look-ahead so that ScanText()
                    // sees the complete escape sequence.
                    nNextCh = '\\';
                    rInput.SeekRel( -1 );
                    ScanText();
                    nRet = RTF_TEXTTOKEN;
                    // fetch a new character only if ScanText() left none behind
                    bNextCh = 0 == nNextCh;
                    break;

                case '*':       // ignoreflag
                    nRet = RTF_IGNOREFLAG;
                    break;
                case ':':       // subentry in an index entry
                    nRet = RTF_SUBENTRYINDEX;
                    break;
                case '|':       // formula-character
                    nRet = RTF_FORMULA;
                    break;

                // a backslash directly followed by a line break is reported
                // as a paragraph token
                case 0x0a:
                case 0x0d:
                    nRet = RTF_PAR;
                    break;

                default:
                    if( RTF_ISALPHA( nNextCh ) )
                    {
                        // control word: backslash followed by ASCII letters
                        aToken = "\\";
                        {
                            OUStringBuffer aStrBuffer( MAX_TOKEN_LEN );
                            do {
                                aStrBuffer.appendUtf32(nNextCh);
                                nNextCh = GetNextChar();
                            } while( RTF_ISALPHA( nNextCh ) );
                            aToken += aStrBuffer;
                        }

                        // minus before numeric parameters
                        bool bNegValue = false;
                        if( '-' == nNextCh )
                        {
                            bNegValue = true;
                            nNextCh = GetNextChar();
                        }

                        // possible numeric parameter
                        if( RTF_ISDIGIT( nNextCh ) )
                        {
                            OUStringBuffer aNumber;
                            do {
                                aNumber.append(static_cast<sal_Unicode>(nNextCh));
                                nNextCh = GetNextChar();
                            } while( RTF_ISDIGIT( nNextCh ) );
                            nTokenValue = OUString::unacquired(aNumber).toInt32();
                            if( bNegValue )
                                nTokenValue = -nTokenValue;
                            bTokenHasValue=true;
                        }
                        else if( bNegValue )        // restore minus
                        {
                            // the '-' did not introduce a number: make it the
                            // look-ahead again and step the stream back
                            nNextCh = '-';
                            rInput.SeekRel( -1 );
                        }
                        if( ' ' == nNextCh )        // blank is part of token!
                            nNextCh = GetNextChar();

                        // search for the token in the table:
                        if( 0 == (nRet = GetRTFToken( aToken )) )
                            // Unknown Control
                            nRet = RTF_UNKNOWNCONTROL;

                        // bug 76812 - unicode token handled as normal text
                        // nNextCh already holds the character that follows
                        // the control word, so do not read another one
                        bNextCh = false;
                        switch( nRet )
                        {
                        case RTF_UC:
                            // \ucN: remember N as the number of characters to
                            // overread after each \u, for the parser and for
                            // the state of the current group
                            if( 0 <= nTokenValue )
                            {
                                nUCharOverread = static_cast<sal_uInt8>(nTokenValue);
                                if (!aParserStates.empty())
                                {
                                    //cmc: other ifdef breaks #i3584
                                    aParserStates.top().nUCharOverread = nUCharOverread;
                                }
                            }
                            aToken.clear(); // #i47831# erase token to prevent the token from being treated as text
                            // read next token
                            nRet = 0;
                            break;

                        case RTF_UPR:
                            // only handled when no SkipGroup() is running
                            // already; otherwise the token is returned as-is
                            if (!_inSkipGroup) {
                            // UPR - overread the group with the ansi
                            //       information
                            int nNextToken;
                            do
                            {
                                nNextToken = GetNextToken_();
                            }
                            while (nNextToken != '{' && nNextToken != sal_Unicode(EOF));

                            SkipGroup();
                            GetNextToken_();  // overread the last bracket
                            nRet = 0;
                            }
                            break;

                        case RTF_U:
                            // When called from ScanText() (bRTF_InTextRead is
                            // set) the caller processes the character itself.
                            if( !bRTF_InTextRead )
                            {
                                // \uN becomes a text token that starts with
                                // the character N
                                nRet = RTF_TEXTTOKEN;
                                aToken = OUString( static_cast<sal_Unicode>(nTokenValue) );

                                // overread the next n "RTF" characters. This
                                // can be also \{, \}, \'88
                                for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
                                {
                                    // line breaks do not count as characters
                                    sal_uInt32 cAnsi = nNextCh;
                                    while( 0xD == cAnsi )
                                        cAnsi = GetNextChar();
                                    while( 0xA == cAnsi )
                                        cAnsi = GetNextChar();

                                    if( '\\' == cAnsi &&
                                        '\'' == GetNextChar() )
                                        // skip HexValue
                                        GetHexValue();
                                    nNextCh = GetNextChar();
                                }
                                // text following the \u goes into the same token
                                ScanText();
                                bNextCh = 0 == nNextCh;
                            }
                            break;
                        }
                    }
                    else if( SvParserState::Pending != eState )
                    {
                        // Bug 34631 - "\ " read on - Blank as character
                        // eState = SvParserState::Error;
                        bNextCh = false;
                    }
                    break;
                }
            }
            break;

        case sal_Unicode(EOF):
            // end of input: the parser is done, the EOF character is the token
            eState = SvParserState::Accepted;
            nRet = nNextCh;
            break;

        case '{':
            {
                // group start: save the current \uc count and source
                // encoding so that the matching '}' can restore them
                if( 0 <= nOpenBrackets )
                {
                    RtfParserState_Impl aState( nUCharOverread, GetSrcEncoding() );
                    aParserStates.push( aState );
                }
                ++nOpenBrackets;
                DBG_ASSERT(
                    static_cast<size_t>(nOpenBrackets) == aParserStates.size(),
                    "ParserStateStack unequal to bracket count" );
                nRet = nNextCh;
            }
            break;

        case '}':
            // group end: drop the state of the closed group and reactivate
            // the one of the enclosing group (or the defaults at top level)
            --nOpenBrackets;
            if( 0 <= nOpenBrackets )
            {
                aParserStates.pop();
                if( !aParserStates.empty() )
                {
                    const RtfParserState_Impl& rRPS =
                            aParserStates.top();
                    nUCharOverread = rRPS.nUCharOverread;
                    SetSrcEncoding( rRPS.eCodeSet );
                }
                else
                {
                    nUCharOverread = 1;
                    SetSrcEncoding( GetCodeSet() );
                }
            }
            DBG_ASSERT(
                static_cast<size_t>(nOpenBrackets) == aParserStates.size(),
                "ParserStateStack unequal to bracket count" );
            nRet = nNextCh;
            break;

        // bare line breaks produce no token; nRet stays 0 and the loop goes on
        case 0x0d:
        case 0x0a:
            break;

        default:
            // now normal text follows
            ScanText();
            nRet = RTF_TEXTTOKEN;
            bNextCh = 0 == nNextCh;
            break;
        }

        if( bNextCh )
            nNextCh = GetNextChar();

    } while( !nRet && SvParserState::Working == eState );
    return nRet;
}


// Reads the next two characters from the input and interprets them as a
// two-digit hexadecimal number (upper or lower case). A character that is
// not a hex digit contributes 0 for its position. nNextCh is left at the
// second character read.
sal_Unicode SvRTFParser::GetHexValue()
{
    sal_Unicode nResult = 0;

    for( int nDigit = 0; nDigit < 2; ++nDigit )
    {
        nNextCh = GetNextChar();
        nResult *= 16;

        if( nNextCh >= '0' && nNextCh <= '9' )
            nResult += (nNextCh - '0');
        else if( nNextCh >= 'a' && nNextCh <= 'f' )
            nResult += (nNextCh - ('a' - 10));
        else if( nNextCh >= 'A' && nNextCh <= 'F' )
            nResult += (nNextCh - ('A' - 10));
    }

    return nResult;
}

// Collects a run of text, starting at the look-ahead character nNextCh, and
// appends it to aToken.
//
// Handled inside the text run:
//  - \'xx hex escapes, converted with the current source encoding
//  - the escaped characters \\ \{ \} \+ and the special characters \~ \- \_
//  - \uN unicode characters (delegated to GetNextToken_()) and \ucN
//  - line breaks, which are dropped
//
// The run ends at '{' or '}', at any other control word (nNextCh is then set
// back to the backslash), at a 0 character, at the end of input, when the
// parser is no longer working, or once MAX_STRING_LEN characters have been
// collected.
void SvRTFParser::ScanText()
{
    const sal_Unicode cBreak = 0;
    OUStringBuffer aStrBuffer;
    bool bContinue = true;
    while( bContinue && IsParserWorking() && aStrBuffer.getLength() < MAX_STRING_LEN)
    {
        // whether a fresh look-ahead character has to be fetched at the end
        // of this iteration
        bool bNextCh = true;
        switch( nNextCh )
        {
        case '\\':
            {
                nNextCh = GetNextChar();
                switch (nNextCh)
                {
                case '\'':
                    {

                        // Bytes of consecutive \'xx escapes (and the plain
                        // characters between them) are gathered here and
                        // converted in one go, so that multi-byte sequences
                        // of the source encoding stay together.
                        OStringBuffer aByteString;
                        while (true)
                        {
                            char c = static_cast<char>(GetHexValue());
                            /*
                             * Note: \'00 is a valid internal character in  a
                             * string in RTF. OStringBuffer supports
                             * appending nulls fine
                             */
                            aByteString.append(c);

                            bool bBreak = false;
                            bool bEOF = false;
                            char nSlash = '\\';
                            // consume plain characters up to the next '{',
                            // '}' or backslash
                            while (!bBreak)
                            {
                                auto next = GetNextChar();
                                if (sal_Unicode(EOF) == next)
                                {
                                    bEOF = true;
                                    break;
                                }
                                if (next>0xFF) // fix for #i43933# and #i35653#
                                {
                                    // does not fit into a byte: convert what
                                    // was gathered so far and append the
                                    // character directly
                                    if (!aByteString.isEmpty())
                                        aStrBuffer.append( OStringToOUString(aByteString.makeStringAndClear(), GetSrcEncoding()) );
                                    aStrBuffer.append(static_cast<sal_Unicode>(next));

                                    continue;
                                }
                                nSlash = static_cast<char>(next);
                                // line breaks are skipped
                                while (nSlash == 0xD || nSlash == 0xA)
                                    nSlash = static_cast<char>(GetNextChar());

                                switch (nSlash)
                                {
                                    case '{':
                                    case '}':
                                    case '\\':
                                        bBreak = true;
                                        break;
                                    default:
                                        aByteString.append(nSlash);
                                        break;
                                }
                            }

                            if (bEOF)
                            {
                                bContinue = false;        // abort, string together
                                break;
                            }

                            nNextCh = GetNextChar();

                            // Unless another \' escape follows directly, step
                            // back one position and make the delimiter the
                            // look-ahead character again; otherwise go round
                            // once more and read the next hex value.
                            if (nSlash != '\\' || nNextCh != '\'')
                            {
                                rInput.SeekRel(-1);
                                nNextCh = static_cast<unsigned char>(nSlash);
                                break;
                            }
                        }

                        bNextCh = false;

                        if (!aByteString.isEmpty())
                            aStrBuffer.append( OStringToOUString(aByteString.makeStringAndClear(), GetSrcEncoding()) );
                    }
                    break;
                case '\\':
                case '}':
                case '{':
                case '+':       // I found in a RTF file
                    // escaped character: taken over literally
                    aStrBuffer.append(sal_Unicode(nNextCh));
                    break;
                case '~':       // nonbreaking space
                    aStrBuffer.append(u'\x00A0');
                    break;
                case '-':       // optional hyphen
                    aStrBuffer.append(u'\x00AD');
                    break;
                case '_':       // nonbreaking hyphen
                    aStrBuffer.append(u'\x2011');
                    break;

                case 'u':
                    // read UNI-Code characters
                    {
                        // Look at the character after the 'u', then step the
                        // stream back by two.
                        // NOTE(review): presumably this positions the stream
                        // right after the backslash again - confirm how
                        // SeekRel() relates to characters read.
                        nNextCh = GetNextChar();
                        rInput.SeekRel( -2 );

                        if( '-' == nNextCh || RTF_ISDIGIT( nNextCh ) )
                        {
                            // \uN: let the tokenizer parse the control word;
                            // bRTF_InTextRead keeps it from building a text
                            // token of its own
                            bRTF_InTextRead = true;

                            // GetNextToken_() overwrites aToken, so keep the
                            // text gathered so far
                            OUString sSave( aToken );
                            nNextCh = '\\';
                            int nToken = GetNextToken_();
                            DBG_ASSERT( RTF_U == nToken, "still not a UNI-Code character" );
                            // don't convert symbol chars
                            aStrBuffer.append(static_cast< sal_Unicode >(nTokenValue));

                            // overread the next n "RTF" characters. This
                            // can be also \{, \}, \'88
                            for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
                            {
                                // line breaks do not count as characters
                                sal_Unicode cAnsi = nNextCh;
                                while( 0xD == cAnsi )
                                    cAnsi = GetNextChar();
                                while( 0xA == cAnsi )
                                    cAnsi = GetNextChar();

                                if( '\\' == cAnsi &&
                                    '\'' == GetNextChar() )
                                    // skip HexValue
                                    GetHexValue();
                                nNextCh = GetNextChar();
                            }
                            bNextCh = false;
                            aToken = sSave;
                            bRTF_InTextRead = false;
                        }
                        else if ( 'c' == nNextCh )
                        {
                            // Prevent text breaking into multiple tokens.
                            // \ucN is evaluated right here: move forward
                            // again over what was stepped back above and
                            // read the number
                            rInput.SeekRel( 2 );
                            nNextCh = GetNextChar();
                            if (RTF_ISDIGIT( nNextCh ))
                            {
                                sal_uInt8 nNewOverread = 0 ;
                                do {
                                    nNewOverread *= 10;
                                    nNewOverread += nNextCh - '0';
                                    nNextCh = GetNextChar();
                                } while ( RTF_ISDIGIT( nNextCh ) );
                                nUCharOverread = nNewOverread;
                                if (!aParserStates.empty())
                                    aParserStates.top().nUCharOverread = nNewOverread;
                            }
                            // a single blank after the control word is
                            // swallowed together with it
                            bNextCh = 0x20 == nNextCh;
                        }
                        else
                        {
                            // some other control word starting with 'u':
                            // leave it to the tokenizer
                            nNextCh = '\\';
                            bContinue = false;        // abort, string together
                        }
                    }
                    break;

                default:
                    // any other control word ends the text run; step back so
                    // that the tokenizer reads it again after the backslash
                    rInput.SeekRel( -1 );
                    nNextCh = '\\';
                    bContinue = false;        // abort, string together
                    break;
                }
            }
            break;

        case sal_Unicode(EOF):
            // end of input at this point is flagged as an error
            eState = SvParserState::Error;
            [[fallthrough]];
        case '{':
        case '}':
            bContinue = false;
            break;

        // line breaks are not part of the text
        case 0x0a:
        case 0x0d:
            break;

        default:
            if( nNextCh == cBreak || aStrBuffer.getLength() >= MAX_STRING_LEN)
                bContinue = false;
            else
            {
                // take over the current character plus any directly
                // following ASCII letters and digits without going through
                // the switch again
                do {
                    // all other characters end up in the text
                    aStrBuffer.appendUtf32(nNextCh);

                    if (sal_Unicode(EOF) == (nNextCh = GetNextChar()))
                    {
                        // end of input: hand out what was collected; eState
                        // is not changed on this path
                        if (!aStrBuffer.isEmpty())
                            aToken += aStrBuffer;
                        return;
                    }
                } while
                (
                    (RTF_ISALPHA(nNextCh) || RTF_ISDIGIT(nNextCh)) &&
                    (aStrBuffer.getLength() < MAX_STRING_LEN)
                );
                bNextCh = false;
            }
        }

        if( bContinue && bNextCh )
            nNextCh = GetNextChar();
    }

    if (!aStrBuffer.isEmpty())
        aToken += aStrBuffer;
}


// Counts active SkipGroup() calls. SkipGroup() returns immediately while it
// is non-zero, and GetNextToken_() leaves \upr alone in that case. Being a
// static data member, the counter is shared by all SvRTFParser instances.
short SvRTFParser::_inSkipGroup=0;

void SvRTFParser::SkipGroup()
{
    short nBrackets=1;
    if (_inSkipGroup>0)
        return;
    _inSkipGroup++;
//#i16185# faking \bin keyword
    do
    {
        switch (nNextCh)
        {
            case '{':
                ++nBrackets;
                break;
            case '}':
                if (!--nBrackets) {
                    _inSkipGroup--;
                    return;
                }
                break;
        }
        int nToken = GetNextToken_();
        if (nToken == RTF_BIN)
        {
            rInput.SeekRel(-1);
            SAL_WARN_IF(nTokenValue < 0, "svtools", "negative value argument for rtf \\bin keyword");
            if (nTokenValue > 0)
                rInput.SeekRel(nTokenValue);
            nNextCh = GetNextChar();
        }
        while (nNextCh==0xa || nNextCh==0xd)
        {
            nNextCh = GetNextChar();
        }
    } while (sal_Unicode(EOF) != nNextCh && IsParserWorking());

    if( SvParserState::Pending != eState && '}' != nNextCh )
        eState = SvParserState::Error;
    _inSkipGroup--;
}

// Unknown data is not interpreted by this implementation: the enclosing
// group is skipped as a whole.
void SvRTFParser::ReadUnknownData()
{
    SkipGroup();
}

// Bitmap data is not interpreted by this implementation either: the
// enclosing group is skipped as a whole.
void SvRTFParser::ReadBitmapData()
{
    SkipGroup();
}


// Entry point of the parser: resets the parser state, checks that the input
// starts with "{\rtf" and then runs the main loop.
//
// Returns the resulting eState; SvParserState::Error if the input does not
// start with the RTF signature.
SvParserState SvRTFParser::CallParser()
{
    // prime the look-ahead character with the first byte of the stream
    char cFirst(0);
    nNextChPos = rInput.Tell();
    rInput.ReadChar( cFirst );
    nNextCh = static_cast<unsigned char>(cFirst);

    // start from a clean state with the default (ANSI) code set
    eState = SvParserState::Working;
    nOpenBrackets = 0;
    eCodeSet = RTL_TEXTENCODING_MS_1252;
    SetSrcEncoding( eCodeSet );

    // the first two tokens should be '{' and \\rtf !!
    const bool bHasRtfSignature =
        '{' == GetNextToken() && RTF_RTF == GetNextToken();
    if( !bHasRtfSignature )
    {
        eState = SvParserState::Error;
        return eState;
    }

    AddFirstRef();
    {
        // call ReleaseRef at end of this scope, even in the face of exceptions
        comphelper::ScopeGuard aReleaseGuard([this] {
            // a pending parser gets continued later and has to stay alive
            if( SvParserState::Pending != eState )
                ReleaseRef();       // now parser is not needed anymore
        });
        Continue( 0 );
    }

    return eState;
}

// Main loop of the parser: processes tokens for as long as the parser is
// working.
//
// nToken is the token to start with; if it is 0, the next token is read
// first. Group brackets, unknown control words and the code set control
// words are handled here; everything else is passed on to NextToken().
// If the parser ends up accepted while groups are still open, eState is
// turned into SvParserState::Error.
void SvRTFParser::Continue( int nToken )
{
//  DBG_ASSERT( SVPAR_CS_DONTKNOW == GetCharSet(),
//              "Characterset was changed." );

    if( !nToken )
        nToken = GetNextToken();

    // set when an iteration ends with the same token index and the same
    // token it started with; the loop is left then instead of spinning on
    bool bLooping = false;

    while (IsParserWorking() && !bLooping)
    {
        auto nCurrentTokenIndex = m_nTokenIndex;
        auto nCurrentToken = nToken;

        // NOTE(review): SaveState() is not defined in this file; presumably
        // it records the position for resuming a pending parse - confirm
        // against SvParser.
        SaveState( nToken );
        switch( nToken )
        {
        case '}':
            // still inside a group: an ordinary token for NextToken();
            // otherwise the outermost group was closed and parsing is done
            if( nOpenBrackets )
                goto NEXTTOKEN;
            eState = SvParserState::Accepted;
            break;

        case '{':
            // an unknown group ?
            {
                // Only a group that starts with the ignore flag followed by
                // an unknown control word is filtered out here. In the other
                // two cases SkipToken() is used before the token goes to
                // NextToken().
                // NOTE(review): presumably SkipToken() steps back to the '{'
                // that was read before the look-ahead - confirm against
                // SvParser::SkipToken.
                if( RTF_IGNOREFLAG != GetNextToken() )
                    nToken = SkipToken();
                else if( RTF_UNKNOWNCONTROL != GetNextToken() )
                    nToken = SkipToken( -2 );
                else
                {
                    // filter immediately
                    ReadUnknownData();
                    nToken = GetNextToken();
                    if( '}' != nToken )
                        eState = SvParserState::Error;
                    break;      // move to next token!!
                }
            }
            goto NEXTTOKEN;

        case RTF_UNKNOWNCONTROL:
            break;      // skip unknown token
        // the code set control words select both the code set of the parser
        // and the current source encoding
        case RTF_NEXTTYPE:
        case RTF_ANSITYPE:
            eCodeSet = RTL_TEXTENCODING_MS_1252;
            SetSrcEncoding( eCodeSet );
            break;
        case RTF_MACTYPE:
            eCodeSet = RTL_TEXTENCODING_APPLE_ROMAN;
            SetSrcEncoding( eCodeSet );
            break;
        case RTF_PCTYPE:
            eCodeSet = RTL_TEXTENCODING_IBM_437;
            SetSrcEncoding( eCodeSet );
            break;
        case RTF_PCATYPE:
            eCodeSet = RTL_TEXTENCODING_IBM_850;
            SetSrcEncoding( eCodeSet );
            break;
        case RTF_ANSICPG:
            // the parameter is a Windows code page number
            eCodeSet = rtl_getTextEncodingFromWindowsCodePage(nTokenValue);
            SetSrcEncoding(eCodeSet);
            break;
        default:
// also the jump target for the bracket cases above
NEXTTOKEN:
            NextToken( nToken );
            break;
        }
        if( IsParserWorking() )
            SaveState( 0 );         // processed till here,
                                    // continue with new token!
        nToken = GetNextToken();
        bLooping = nCurrentTokenIndex == m_nTokenIndex && nToken == nCurrentToken;
    }
    // input that ends while groups are still open is not valid
    if( SvParserState::Accepted == eState && 0 < nOpenBrackets )
        eState = SvParserState::Error;
}

// Switches the source encoding to eEnc and records it in the state of the
// current group, if there is one. RTL_TEXTENCODING_DONTKNOW selects the code
// set of the parser (GetCodeSet()) instead.
void SvRTFParser::SetEncoding( rtl_TextEncoding eEnc )
{
    const rtl_TextEncoding eEffective =
        (RTL_TEXTENCODING_DONTKNOW == eEnc) ? GetCodeSet() : eEnc;

    if (!aParserStates.empty())
    {
        // remembered per group, so that it comes back into effect when a
        // nested group is closed
        aParserStates.top().eCodeSet = eEffective;
    }
    SetSrcEncoding(eEffective);
}

/* vim:set shiftwidth=4 softtabstop=4 expandtab: */