567 lines
18 KiB
C++
567 lines
18 KiB
C++
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
|
|
/*
|
|
* This file is part of the LibreOffice project.
|
|
*
|
|
* This Source Code Form is subject to the terms of the Mozilla Public
|
|
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
*
|
|
* This file incorporates work covered by the following license notice:
|
|
*
|
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
* contributor license agreements. See the NOTICE file distributed
|
|
* with this work for additional information regarding copyright
|
|
* ownership. The ASF licenses this file to you under the Apache
|
|
* License, Version 2.0 (the "License"); you may not use this file
|
|
* except in compliance with the License. You may obtain a copy of
|
|
* the License at http://www.apache.org/licenses/LICENSE-2.0 .
|
|
*/
|
|
#include <string.h>
|
|
|
|
#include <algorithm>
|
|
|
|
#include <sal/types.h>
|
|
|
|
#include <rtl/textenc.h>
|
|
#include <rtl/tencinfo.h>
|
|
|
|
#include <com/sun/star/io/XInputStream.hpp>
|
|
|
|
using namespace ::com::sun::star::uno;
|
|
using namespace ::com::sun::star::io;
|
|
|
|
using ::rtl::OString;
|
|
|
|
#include "xml2utf.hxx"
|
|
|
|
namespace sax_expatwrap {
|
|
|
|
/**
 * Reads the next chunk from the wrapped input stream into seq and, if a
 * non-UTF-8 encoding was detected, converts the chunk to UTF-8.
 *
 * Until the first chunk has been delivered, at least 512 bytes are requested
 * and reading continues until isEncodingRecognizable() accepts the buffered
 * data or the stream ends. The first delivered chunk additionally has its
 * encoding declaration stripped by removeEncoding().
 *
 * @param seq         receives the (possibly converted) bytes
 * @param nMaxToRead  number of bytes to request from the stream
 * @return the length of seq after processing; 0 when the stream delivered
 *         nothing and no bytes were buffered
 * @throws NotConnectedException when no input stream is set
 */
sal_Int32 XMLFile2UTFConverter::readAndConvert( Sequence<sal_Int8> &seq , sal_Int32 nMaxToRead )
    throw ( IOException, NotConnectedException , BufferSizeExceededException , RuntimeException )
{
    if( ! m_in.is() ) {
        throw NotConnectedException();
    }
    if( ! m_bStarted ) {
        // it should be possible to find the encoding attribute
        // within the first 512 bytes == 128 chars in UCS-4
        nMaxToRead = ::std::max( sal_Int32(512) , nMaxToRead );
    }

    // bytes read so far that did not yet suffice to recognise the encoding
    Sequence< sal_Int8 > seqStart;
    for( ;; )
    {
        const sal_Int32 nRead = m_in->readSomeBytes( seq , nMaxToRead );
        const sal_Int32 nPending = seqStart.getLength();

        if( nRead + nPending == 0 )
        {
            // nothing read and nothing buffered: end of stream
            return nRead;
        }

        if( ! m_bStarted )
        {
            if( nPending )
            {
                if( nRead )
                {
                    // prefix the new data with what we had so far
                    const sal_Int32 nLength = seq.getLength();
                    seq.realloc( nPending + nLength );

                    sal_Int8 *pData = seq.getArray();
                    memmove( pData + nPending , pData , nLength );
                    memcpy( pData , seqStart.getConstArray() , nPending );
                }
                else
                {
                    // End of stream while still waiting for more data: no
                    // further bytes will arrive, so continue with what was
                    // buffered instead of dropping it.
                    seq = seqStart;
                }
            }

            // autodetection with the first bytes; at end of stream there is
            // no point in waiting for a bigger buffer
            if( nRead && ! isEncodingRecognizable( seq ) )
            {
                // remember what we have so far and read more
                seqStart = seq;
                continue;
            }
            if( scanForEncoding( seq ) || !m_sEncoding.isEmpty() ) {
                // initialize decoding
                initializeDecoding();
            }
            seqStart = Sequence < sal_Int8 > ();
        }

        // do the encoding
        if( m_pText2Unicode && m_pUnicode2Text &&
            m_pText2Unicode->canContinue() && m_pUnicode2Text->canContinue() ) {

            Sequence<sal_Unicode> seqUnicode = m_pText2Unicode->convert( seq );
            seq = m_pUnicode2Text->convert( seqUnicode.getConstArray(), seqUnicode.getLength() );
        }

        if( ! m_bStarted )
        {
            // it must now be ensured, that no encoding attribute exist anymore
            // ( otherwise the expat-Parser will crash )
            // This must be done after decoding !
            // ( e.g. Files decoded in ucs-4 cannot be read properly )
            m_bStarted = sal_True;
            removeEncoding( seq );
        }

        // conversion and attribute removal may have changed the length
        return seq.getLength();
    }
}
|
|
|
|
|
|
/** Destroys the two converter objects owned by this instance, if any. */
XMLFile2UTFConverter::~XMLFile2UTFConverter()
{
    // delete on a null pointer is a no-op, so no explicit check is required
    delete m_pText2Unicode;
    delete m_pUnicode2Text;
}
|
|
|
|
|
|
/**
 * Cuts the ` encoding='...'` attribute out of a leading `<?xml` declaration.
 *
 * Only the first line of seq (up to the first line feed) is inspected. If seq
 * does not start with `<?xml`, or no complete, non-empty quoted value is found
 * after ` encoding`, seq is left untouched.
 *
 * @param seq  buffer that is modified in place and shortened on success
 */
void XMLFile2UTFConverter::removeEncoding( Sequence<sal_Int8> &seq )
{
    static const char aXmlDecl[] = "<?xml";
    const sal_Int32 nXmlDeclLen = sizeof( aXmlDecl ) - 1;

    // The buffer is not NUL-terminated, so its length must be checked before
    // strncmp may look at the first bytes.
    if( seq.getLength() < nXmlDeclLen ||
        strncmp( reinterpret_cast<const char *>( seq.getConstArray() ) , aXmlDecl , nXmlDeclLen ) != 0 )
    {
        return;
    }

    // scan for encoding
    OString str( reinterpret_cast<const sal_Char *>( seq.getConstArray() ) , seq.getLength() );

    // cut the string at the first line break
    const sal_Int32 nLineEnd = str.indexOf( 10 );
    if( nLineEnd >= 0 )
    {
        str = str.copy( 0 , nLineEnd );
    }

    const sal_Int32 nFound = str.indexOf( " encoding" );
    if( nFound < 0 )
    {
        return;
    }

    // NOTE(review): if no apostrophe follows, indexOf yields -1, which is
    // smaller than any valid nStart, so the apostrophe branch is also taken
    // for a double-quoted value and nothing gets removed. scanForEncoding()
    // uses the same selection; both would have to be changed together.
    sal_Int32 nStop;
    sal_Int32 nStart = str.indexOf( "\"" , nFound );
    if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )
    {
        nStart = str.indexOf( "'" , nFound );
        nStop  = str.indexOf( "'" , nStart + 1 );
    }
    else
    {
        nStop  = str.indexOf( "\"" , nStart + 1 );
    }

    if( nStart >= 0 && nStop >= 0 && nStart + 1 < nStop )
    {
        // remove encoding tag from file: move everything behind the closing
        // quote down to where " encoding" started, then shrink the buffer
        sal_Int8 *pData = seq.getArray();
        memmove( pData + nFound ,
                 pData + nStop + 1 ,
                 seq.getLength() - nStop - 1 );
        seq.realloc( seq.getLength() - ( nStop + 1 - nFound ) );
    }
}
|
|
|
|
// Checks, if enough data has been accumulated to recognize the encoding
|
|
sal_Bool XMLFile2UTFConverter::isEncodingRecognizable( const Sequence< sal_Int8 > &seq)
|
|
{
|
|
const sal_Int8 *pSource = seq.getConstArray();
|
|
sal_Bool bCheckIfFirstClosingBracketExsists = sal_False;
|
|
|
|
if( seq.getLength() < 8 ) {
|
|
// no recognition possible, when less than 8 bytes are available
|
|
return sal_False;
|
|
}
|
|
|
|
if( ! strncmp( (const char * ) pSource , "<?xml" , 4 ) ) {
|
|
// scan if the <?xml tag finishes within this buffer
|
|
bCheckIfFirstClosingBracketExsists = sal_True;
|
|
}
|
|
else if( ('<' == pSource[0] || '<' == pSource[2] ) &&
|
|
( ('?' == pSource[4] || '?' == pSource[6] ) ) )
|
|
{
|
|
// check for utf-16
|
|
bCheckIfFirstClosingBracketExsists = sal_True;
|
|
}
|
|
else if( ( '<' == pSource[1] || '<' == pSource[3] ) &&
|
|
( '?' == pSource[5] || '?' == pSource[7] ) )
|
|
{
|
|
// check for
|
|
bCheckIfFirstClosingBracketExsists = sal_True;
|
|
}
|
|
|
|
if( bCheckIfFirstClosingBracketExsists )
|
|
{
|
|
for( sal_Int32 i = 0; i < seq.getLength() ; i ++ )
|
|
{
|
|
// whole <?xml tag is valid
|
|
if( '>' == pSource[ i ] )
|
|
{
|
|
return sal_True;
|
|
}
|
|
}
|
|
return sal_False;
|
|
}
|
|
|
|
// No <? tag in front, no need for a bigger buffer
|
|
return sal_True;
|
|
}
|
|
|
|
/**
 * Inspects the first bytes of seq to determine the document encoding and
 * stores the result in m_sEncoding.
 *
 * For UTF-16 data without a byte order mark, a BOM is inserted in front of
 * seq; a UTF-8 BOM is removed from seq.
 *
 * @param seq  start of the document; may be modified as described above
 * @return sal_False if fewer than 4 bytes are available, for the EBCDIC
 *         signature and for unrecognized input; sal_True otherwise
 */
sal_Bool XMLFile2UTFConverter::scanForEncoding( Sequence< sal_Int8 > &seq )
{
    // NOTE: pSource must not be used after seq has been reallocated below.
    const sal_uInt8 *pSource = reinterpret_cast<const sal_uInt8*>( seq.getConstArray() );
    sal_Bool bReturn = sal_True;

    if( seq.getLength() < 4 ) {
        // no recognition possible, when less than 4 bytes are available
        return sal_False;
    }

    // first level : detect possible file formats
    if( seq.getLength() >= 5 &&
        ! strncmp( reinterpret_cast<const char *>( pSource ) , "<?xml" , 5 ) ) {

        // scan for encoding
        OString str( reinterpret_cast<const sal_Char *>( pSource ) , seq.getLength() );

        // cut the string at the first line break
        const sal_Int32 nLineEnd = str.indexOf( 10 );
        if( nLineEnd >= 0 )
        {
            str = str.copy( 0 , nLineEnd );
        }

        // Only look for a value when the attribute is present. Using -1 as a
        // search offset would read in front of the string and could pick up
        // the value of another attribute (e.g. version).
        const sal_Int32 nFound = str.indexOf( " encoding" );
        if( nFound >= 0 ) {
            // NOTE(review): if no apostrophe follows, indexOf yields -1, which
            // is smaller than any valid nStart, so the apostrophe branch is
            // also taken for a double-quoted value and no encoding is
            // extracted. removeEncoding() uses the same selection; both would
            // have to be changed together.
            sal_Int32 nStop;
            sal_Int32 nStart = str.indexOf( "\"" , nFound );
            if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )
            {
                nStart = str.indexOf( "'" , nFound );
                nStop  = str.indexOf( "'" , nStart + 1 );
            }
            else
            {
                nStop  = str.indexOf( "\"" , nStart + 1 );
            }
            if( nStart >= 0 && nStop >= 0 && nStart + 1 < nStop )
            {
                // encoding found finally
                m_sEncoding = str.copy( nStart + 1 , nStop - nStart - 1 );
            }
        }
    }
    else if( 0xFE == pSource[0] &&
             0xFF == pSource[1] ) {
        // UTF-16 big endian
        // conversion is done so that encoding information can be easily extracted
        m_sEncoding = "utf-16";
    }
    else if( 0xFF == pSource[0] &&
             0xFE == pSource[1] ) {
        // UTF-16 little endian
        // conversion is done so that encoding information can be easily extracted
        m_sEncoding = "utf-16";
    }
    else if( 0x00 == pSource[0] && 0x3c == pSource[1] && 0x00 == pSource[2] && 0x3f == pSource[3] ) {
        // UTF-16 big endian without byte order mark (this is (strictly speaking) an error.)
        // The byte order mark is simply added: grow by two bytes, shift the
        // old content back and write the mark in front.
        seq.realloc( seq.getLength() + 2 );
        sal_Int8 *pData = seq.getArray();
        memmove( pData + 2 , pData , seq.getLength() - 2 );
        reinterpret_cast<sal_uInt8*>( pData )[0] = 0xFE;
        reinterpret_cast<sal_uInt8*>( pData )[1] = 0xFF;

        m_sEncoding = "utf-16";
    }
    else if( 0x3c == pSource[0] && 0x00 == pSource[1] && 0x3f == pSource[2] && 0x00 == pSource[3] ) {
        // UTF-16 little endian without byte order mark (this is (strictly speaking) an error.)
        // The byte order mark is simply added.
        seq.realloc( seq.getLength() + 2 );
        sal_Int8 *pData = seq.getArray();
        memmove( pData + 2 , pData , seq.getLength() - 2 );
        reinterpret_cast<sal_uInt8*>( pData )[0] = 0xFF;
        reinterpret_cast<sal_uInt8*>( pData )[1] = 0xFE;

        m_sEncoding = "utf-16";
    }
    else if( 0xEF == pSource[0] &&
             0xBB == pSource[1] &&
             0xBF == pSource[2] )
    {
        // UTF-8 BOM (byte order mark); signifies utf-8, and not byte order
        // The BOM is removed.
        sal_Int8 *pData = seq.getArray();
        memmove( pData , pData + 3 , seq.getLength() - 3 );
        seq.realloc( seq.getLength() - 3 );
        m_sEncoding = "utf-8";
    }
    else if( 0x00 == pSource[0] && 0x00 == pSource[1] && 0x00 == pSource[2] && 0x3c == pSource[3] ) {
        // UCS-4 big endian
        m_sEncoding = "ucs-4";
    }
    else if( 0x3c == pSource[0] && 0x00 == pSource[1] && 0x00 == pSource[2] && 0x00 == pSource[3] ) {
        // UCS-4 little endian
        m_sEncoding = "ucs-4";
    }
    else if( 0x4c == pSource[0] && 0x6f == pSource[1] &&
             0xa7 == pSource[2] && 0x94 == pSource[3] ) {
        // EBCDIC
        bReturn = sal_False;   // must be extended
    }
    else {
        // other
        // UTF8 is directly recognized by the parser.
        bReturn = sal_False;
    }

    return bReturn;
}
|
|
|
|
void XMLFile2UTFConverter::initializeDecoding()
|
|
{
|
|
|
|
if( !m_sEncoding.isEmpty() )
|
|
{
|
|
rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( m_sEncoding.getStr() );
|
|
if( encoding != RTL_TEXTENCODING_UTF8 )
|
|
{
|
|
m_pText2Unicode = new Text2UnicodeConverter( m_sEncoding );
|
|
m_pUnicode2Text = new Unicode2TextConverter( RTL_TEXTENCODING_UTF8 );
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
//----------------------------------------------
|
|
//
|
|
// Text2UnicodeConverter
|
|
//
|
|
//----------------------------------------------
|
|
/**
 * Sets up a converter for the encoding named by a MIME charset string.
 * If the name cannot be resolved, the object is marked as neither
 * initialized nor able to continue.
 *
 * @param sEncoding  MIME charset name of the source encoding
 */
Text2UnicodeConverter::Text2UnicodeConverter( const OString &sEncoding )
{
    const rtl_TextEncoding eResolved = rtl_getTextEncodingFromMimeCharset( sEncoding.getStr() );
    if( RTL_TEXTENCODING_DONTKNOW != eResolved )
    {
        init( eResolved );
        return;
    }

    // unknown charset name: no converter is created
    m_bCanContinue = sal_False;
    m_bInitialized = sal_False;
}
|
|
|
|
/**
 * Releases the conversion context and the converter, provided init() created
 * them.
 */
Text2UnicodeConverter::~Text2UnicodeConverter()
{
    if( m_bInitialized )
    {
        rtl_destroyTextToUnicodeContext( m_convText2Unicode , m_contextText2Unicode );
        // The handle was obtained from rtl_createTextToUnicodeConverter(), so
        // it has to be released with the matching text-to-unicode function.
        rtl_destroyTextToUnicodeConverter( m_convText2Unicode );
    }
}
|
|
|
|
/**
 * Creates the text-to-unicode converter and its context for the given
 * encoding and marks the object as initialized and able to continue.
 *
 * @param encoding  source text encoding
 */
void Text2UnicodeConverter::init( rtl_TextEncoding encoding )
{
    m_rtlEncoding         = encoding;
    m_convText2Unicode    = rtl_createTextToUnicodeConverter( encoding );
    m_contextText2Unicode = rtl_createTextToUnicodeContext( m_convText2Unicode );

    m_bInitialized = sal_True;
    m_bCanContinue = sal_True;
}
|
|
|
|
|
|
/**
 * Converts a chunk of encoded text to UTF-16.
 *
 * Bytes left over from the previous call (m_seqSource) are prepended to
 * seqText before converting. Source bytes that the converter reports as
 * incomplete at the end of the input are stored in m_seqSource for the next
 * call.
 *
 * @param seqText  next chunk of encoded bytes
 * @return the converted UTF-16 code units
 */
Sequence<sal_Unicode> Text2UnicodeConverter::convert( const Sequence<sal_Int8> &seqText )
{
    sal_uInt32 uiInfo = 0;
    sal_Size nSrcCvtBytes = 0;
    sal_Size nTargetCount = 0;
    sal_Size nSourceCount = 0;

    // the whole source size
    const sal_Int32 nPending    = m_seqSource.getLength();
    const sal_Int32 nSourceSize = seqText.getLength() + nPending;
    Sequence<sal_Unicode> seqUnicode ( nSourceSize );

    const sal_Int8 *pbSource = seqText.getConstArray();

    // Owns the joined input when there is an old rest; being a Sequence it is
    // released automatically, even if a later realloc throws.
    Sequence<sal_Int8> seqJoined;
    if( nPending ) {
        // put old rest and new byte sequence into one array
        seqJoined.realloc( nSourceSize );
        sal_Int8 *pbJoined = seqJoined.getArray();
        memcpy( pbJoined , m_seqSource.getConstArray() , nPending );
        memcpy( pbJoined + nPending , seqText.getConstArray() , seqText.getLength() );
        pbSource = pbJoined;

        // set to zero again
        m_seqSource = Sequence< sal_Int8 >();
    }

    for( ;; ) {

        /* All invalid characters are transformed to the unicode undefined char */
        nTargetCount += rtl_convertTextToUnicode(
                                    m_convText2Unicode,
                                    m_contextText2Unicode,
                                    reinterpret_cast<const sal_Char *>( pbSource + nSourceCount ),
                                    nSourceSize - nSourceCount ,
                                    seqUnicode.getArray() + nTargetCount,
                                    seqUnicode.getLength() - nTargetCount,
                                    RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT   |
                                    RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT |
                                    RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT,
                                    &uiInfo,
                                    &nSrcCvtBytes );
        nSourceCount += nSrcCvtBytes;

        if( ! ( uiInfo & RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL ) ) {
            break;
        }

        // Target buffer exhausted: enlarge it and convert the remaining
        // input. The lower bound ensures an empty buffer grows as well.
        seqUnicode.realloc( ::std::max( sal_Int32(16) , seqUnicode.getLength() * 2 ) );
    }

    if( uiInfo & RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL ) {
        // save the unconverted bytes for the next conversion
        const sal_Int32 nRest = static_cast<sal_Int32>( nSourceSize - nSourceCount );
        m_seqSource.realloc( nRest );
        memcpy( m_seqSource.getArray() , pbSource + nSourceCount , nRest );
    }

    // set to correct unicode size
    seqUnicode.realloc( nTargetCount );

    return seqUnicode;
}
|
|
|
|
|
|
|
|
//----------------------------------------------
|
|
//
|
|
// Unicode2TextConverter
|
|
//
|
|
//----------------------------------------------
|
|
/**
 * Sets up a converter from UTF-16 to the given target encoding.
 *
 * @param encoding  target text encoding
 */
Unicode2TextConverter::Unicode2TextConverter( rtl_TextEncoding encoding )
{
    // the complete set-up is done by init()
    this->init( encoding );
}
|
|
|
|
|
|
/**
 * Releases the conversion context and the converter, provided the object was
 * initialized.
 */
Unicode2TextConverter::~Unicode2TextConverter()
{
    if( ! m_bInitialized )
    {
        // nothing was created, nothing to release
        return;
    }

    rtl_destroyUnicodeToTextContext( m_convUnicode2Text , m_contextUnicode2Text );
    rtl_destroyUnicodeToTextConverter( m_convUnicode2Text );
}
|
|
|
|
|
|
/**
 * Converts UTF-16 code units to the target encoding.
 *
 * Code units left over from the previous call (m_seqSource) are prepended to
 * the input before converting. Code units that the converter reports as
 * incomplete at the end of the input are stored in m_seqSource for the next
 * call.
 *
 * @param puSource     pointer to the UTF-16 code units to convert
 * @param nSourceSize  number of code units at puSource
 * @return the converted bytes
 */
Sequence<sal_Int8> Unicode2TextConverter::convert(const sal_Unicode *puSource , sal_Int32 nSourceSize)
{
    // Owns the joined input when there is an old rest; being a Sequence it is
    // released automatically, even if a later realloc throws.
    Sequence<sal_Unicode> seqJoined;

    const sal_Int32 nPending = m_seqSource.getLength();
    if( nPending ) {
        // For surrogates !
        // put old rest and new byte sequence into one array
        // In general when surrogates are used, they should be rarely
        // cut off between two convert()-calls. So this code is used
        // rarely and the extra copy is acceptable.
        seqJoined.realloc( nSourceSize + nPending );
        sal_Unicode *puJoined = seqJoined.getArray();
        memcpy( puJoined ,
                m_seqSource.getConstArray() ,
                nPending * sizeof( sal_Unicode ) );
        memcpy( puJoined + nPending ,
                puSource ,
                nSourceSize * sizeof( sal_Unicode ) );
        puSource = puJoined;
        nSourceSize += nPending;

        m_seqSource = Sequence< sal_Unicode > ();
    }

    sal_Size nTargetCount = 0;
    sal_Size nSourceCount = 0;

    sal_uInt32 uiInfo = 0;
    sal_Size nSrcCvtChars = 0;

    // take nSourceSize * 3 as preference
    // this is an upper boundary for converting to utf8,
    // which most often used as the target.
    sal_Int32 nSeqSize = nSourceSize * 3;

    Sequence<sal_Int8> seqText( nSeqSize );
    sal_Char *pTarget = reinterpret_cast<sal_Char *>( seqText.getArray() );
    for( ;; ) {

        nTargetCount += rtl_convertUnicodeToText(
                                    m_convUnicode2Text,
                                    m_contextUnicode2Text,
                                    puSource + nSourceCount,
                                    nSourceSize - nSourceCount ,
                                    pTarget + nTargetCount,
                                    nSeqSize - nTargetCount,
                                    RTL_UNICODETOTEXT_FLAGS_UNDEFINED_DEFAULT |
                                    RTL_UNICODETOTEXT_FLAGS_INVALID_DEFAULT ,
                                    &uiInfo,
                                    &nSrcCvtChars);
        nSourceCount += nSrcCvtChars;

        if( ! ( uiInfo & RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL ) ) {
            break;
        }

        // Target buffer exhausted: double the array size and continue. The
        // lower bound ensures an empty buffer grows as well.
        nSeqSize = ::std::max( sal_Int32(16) , nSeqSize * 2 );
        seqText.realloc( nSeqSize );
        pTarget = reinterpret_cast<sal_Char *>( seqText.getArray() );
    }

    // for surrogates
    if( uiInfo & RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL ) {
        // keep the unconverted code units for the next call
        const sal_Int32 nRest = static_cast<sal_Int32>( nSourceSize - nSourceCount );
        m_seqSource.realloc( nRest );
        memcpy( m_seqSource.getArray() ,
                puSource + nSourceCount ,
                nRest * sizeof( sal_Unicode ) );
    }

    // reduce the size of the buffer to what was actually written
    seqText.realloc( nTargetCount );

    return seqText;
}
|
|
|
|
/**
 * Creates the unicode-to-text converter and its context for the given
 * encoding and marks the object as initialized and able to continue.
 *
 * @param encoding  target text encoding
 */
void Unicode2TextConverter::init( rtl_TextEncoding encoding )
{
    m_rtlEncoding         = encoding;
    m_convUnicode2Text    = rtl_createUnicodeToTextConverter( encoding );
    m_contextUnicode2Text = rtl_createUnicodeToTextContext( m_convUnicode2Text );

    m_bInitialized = sal_True;
    m_bCanContinue = sal_True;
}
|
|
|
|
|
|
}
|
|
|
|
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|