office-gobmx/sal/textenc/tcvtutf8.c
2001-10-17 13:35:30 +00:00

412 lines
15 KiB
C

/*************************************************************************
*
* $RCSfile: tcvtutf8.c,v $
*
* $Revision: 1.3 $
*
* last change: $Author: sb $ $Date: 2001-10-17 14:35:30 $
*
* The Contents of this file are made available subject to the terms of
* either of the following licenses
*
* - GNU Lesser General Public License Version 2.1
* - Sun Industry Standards Source License Version 1.1
*
* Sun Microsystems Inc., October, 2000
*
* GNU Lesser General Public License Version 2.1
* =============================================
* Copyright 2000 by Sun Microsystems, Inc.
* 901 San Antonio Road, Palo Alto, CA 94303, USA
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License version 2.1, as published by the Free Software Foundation.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*
*
* Sun Industry Standards Source License Version 1.1
* =================================================
* The contents of this file are subject to the Sun Industry Standards
* Source License Version 1.1 (the "License"); You may not use this file
* except in compliance with the License. You may obtain a copy of the
* License at http://www.openoffice.org/license.html.
*
* Software provided under this License is provided on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
* WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS,
* MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING.
* See the License for the specific provisions governing your rights and
* obligations concerning the Software.
*
* The Initial Developer of the Original Code is: Sun Microsystems, Inc.
*
* Copyright: 2000 by Sun Microsystems, Inc.
*
* All Rights Reserved.
*
* Contributor(s): _______________________________________
*
*
************************************************************************/
#ifndef INCLUDED_RTL_TEXTENC_TENCHELP_H
#include "tenchelp.h"
#endif
#ifndef INCLUDED_RTL_TEXTENC_UNICHARS_H
#include "unichars.h"
#endif
#ifndef _RTL_TEXTCVT_H
#include "rtl/textcvt.h"
#endif
/* ----------------------------------------------------------------------- */
/*
 * Convert UTF-8 encoded bytes to UTF-16 code units.
 *
 * Conversion stops at the end of the source buffer, when the destination
 * buffer is full, or at the first problem that nFlags asks to be reported as
 * an error.  Whenever conversion stops early, *pSrcCvtBytes is the offset of
 * the first byte of the sequence that was not converted, so that nothing is
 * counted as consumed without having produced its output.
 *
 * pData         not used by this converter
 * pContext      not used by this converter
 * pSrcBuf       UTF-8 input bytes
 * nSrcBytes     number of bytes available at pSrcBuf
 * pDestBuf      receives the UTF-16 code units
 * nDestChars    capacity of pDestBuf, counted in sal_Unicode units
 * nFlags        RTL_TEXTTOUNICODE_FLAGS_*: the INVALID sub-mask governs
 *               malformed byte sequences, the MBUNDEFINED sub-mask governs
 *               values above 0x10FFFF; each selects between stopping with
 *               an error, skipping silently, and writing
 *               RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER
 * pInfo         receives the RTL_TEXTTOUNICODE_INFO_* result bits
 * pSrcCvtBytes  receives the number of source bytes consumed
 *
 * Returns the number of sal_Unicode units written to pDestBuf.
 *
 * NOTE(review): overlong encodings and surrogate values encoded in three
 * bytes are not rejected; a value below 0x10000 that arrives in a sequence
 * of four or more bytes is still handed to the surrogate helpers -- confirm
 * whether that needs separate handling.
 */
sal_Size ImplUTF8ToUnicode( const ImplTextConverterData* pData, void* pContext,
                            const sal_Char* pSrcBuf, sal_Size nSrcBytes,
                            sal_Unicode* pDestBuf, sal_Size nDestChars,
                            sal_uInt32 nFlags, sal_uInt32* pInfo,
                            sal_Size* pSrcCvtBytes )
{
    /* Number of continuation bytes following a lead byte 0xF0..0xFF,     */
    /* indexed by the low nibble of the lead byte (0 = invalid lead byte) */
    /*   0xF0-0xF7: 3    0xF8-0xFB: 4    0xFC-0xFD: 5    0xFE-0xFF: none  */
    static sal_uInt8 const nExtraBytesFromUTF8Tab[16] =
    {
        3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 0, 0
    };
    /* Payload mask of the lead byte, indexed by (continuation bytes - 3) */
    static sal_uInt8 const nFirstByteMaskTab[3] =
    {
        0x07, 0x03, 0x01
    };

    sal_uInt8       nTrailBytes;    /* continuation bytes the lead byte asks for */
    sal_uInt8       nPending;       /* continuation bytes still to be read */
    sal_uChar       c;
    sal_uInt32      cConv;
    sal_Unicode*    pEndDestBuf;
    const sal_Char* pEndSrcBuf;
    const sal_Char* pSeqStart;      /* lead byte of the sequence being decoded */

    *pInfo = 0;
    pEndDestBuf = pDestBuf+nDestChars;
    pEndSrcBuf  = pSrcBuf+nSrcBytes;

    while ( pSrcBuf < pEndSrcBuf )
    {
        /* Every path below writes at least one unit or none at all, so */
        /* one free slot is the precondition for looking at the input.  */
        if ( pDestBuf == pEndDestBuf )
        {
            *pInfo |= RTL_TEXTTOUNICODE_INFO_ERROR | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
            break;
        }

        pSeqStart = pSrcBuf;
        c = (sal_uChar)*pSrcBuf;

        /* 1 Byte */
        /* 0aaaaaaa (000000000aaaaaaa) */
        if ( !(c & 0x80) )
        {
            *pDestBuf++ = (sal_Unicode)c;
            pSrcBuf++;
            continue;
        }

        /* Classify the lead byte: how many continuation bytes follow and */
        /* which of its bits belong to the character value.               */
        nTrailBytes = 0;
        cConv       = 0;
        if ( (c & 0xE0) == 0xC0 )
        {
            /* 110aaaaa 10bbbbbb (00000aaaaabbbbbb) */
            nTrailBytes = 1;
            cConv = c & 0x1F; /* 00011111 */
        }
        else if ( (c & 0xF0) == 0xE0 )
        {
            /* 1110aaaa 10bbbbbb 10cccccc (aaaabbbbbbcccccc) */
            nTrailBytes = 2;
            cConv = c & 0x0F; /* 00001111 */
        }
        else if ( (c & 0xF0) == 0xF0 )
        {
            /* 4-6 Bytes, converted to ucs4 first */
            nTrailBytes = nExtraBytesFromUTF8Tab[c & 0x0F];
            if ( nTrailBytes )
                cConv = c & nFirstByteMaskTab[nTrailBytes-3];
        }
        /* else 10xxxxxx: a continuation byte where a lead byte is expected */

        if ( !nTrailBytes )
        {
            *pInfo |= RTL_TEXTTOUNICODE_INFO_INVALID;
            if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK) == RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR )
            {
                *pInfo |= RTL_TEXTTOUNICODE_INFO_ERROR;
                break;
            }
            else if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK) != RTL_TEXTTOUNICODE_FLAGS_INVALID_IGNORE )
                *pDestBuf++ = RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER;
            pSrcBuf++;
            continue;
        }

        /* The whole sequence must be available; compare distances rather */
        /* than forming a pointer that may lie beyond the buffer end.     */
        if ( pEndSrcBuf-pSrcBuf < nTrailBytes+1 )
        {
            *pInfo |= RTL_TEXTTOUNICODE_INFO_ERROR | RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL;
            break;
        }

        nPending = nTrailBytes;
        do
        {
            pSrcBuf++;
            c = (sal_uChar)*pSrcBuf;
            if ( (c & 0xC0) != 0x80 )
                break;
            cConv <<= 6;
            cConv += c & 0x3F; /* 00111111 */
            nPending--;
        }
        while ( nPending );

        if ( nPending )
        {
            *pInfo |= RTL_TEXTTOUNICODE_INFO_INVALID;
            if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK) == RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR )
            {
                pSrcBuf = pSeqStart;
                *pInfo |= RTL_TEXTTOUNICODE_INFO_ERROR;
                break;
            }
            else if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK) != RTL_TEXTTOUNICODE_FLAGS_INVALID_IGNORE )
                *pDestBuf++ = RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER;
            /* pSrcBuf stays on the byte that broke the sequence: it is not */
            /* a continuation byte, so it may start a valid character and   */
            /* has to be examined again rather than skipped.                */
            continue;
        }

        /* Step over the last continuation byte */
        pSrcBuf++;

        if ( nTrailBytes < 3 )
        {
            /* 2-3 byte forms always fit into a single UTF-16 unit */
            *pDestBuf++ = (sal_Unicode)cConv;
        }
        else if ( cConv > 0x10FFFF )
        {
            *pInfo |= RTL_TEXTTOUNICODE_INFO_MBUNDEFINED;
            if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_MASK) == RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR )
            {
                /* Nothing was written for this sequence, so do not */
                /* report it as consumed.                           */
                pSrcBuf = pSeqStart;
                *pInfo |= RTL_TEXTTOUNICODE_INFO_ERROR;
                break;
            }
            else if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_MASK) != RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_IGNORE )
                *pDestBuf++ = RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER;
        }
        else if ( pEndDestBuf-pDestBuf < 2 )
        {
            /* No room for a surrogate pair: rewind so that the caller can */
            /* retry this character with a larger destination buffer.      */
            pSrcBuf = pSeqStart;
            *pInfo |= RTL_TEXTTOUNICODE_INFO_ERROR | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
            break;
        }
        else
        {
            *pDestBuf++ = (sal_Unicode) ImplGetHighSurrogate(cConv);
            *pDestBuf++ = (sal_Unicode) ImplGetLowSurrogate(cConv);
        }
    }

    *pSrcCvtBytes = nSrcBytes - (pEndSrcBuf-pSrcBuf);
    return (nDestChars - (pEndDestBuf-pDestBuf));
}
/* ----------------------------------------------------------------------- */
/*
 * Convert UTF-16 code units to UTF-8 encoded bytes.
 *
 * A high surrogate followed by a low surrogate is combined and written as
 * one multi-byte sequence.  A high surrogate that is not followed by a low
 * surrogate is reported as RTL_UNICODETOTEXT_INFO_INVALID and, depending on
 * the INVALID sub-mask of nFlags, stops the conversion with an error, is
 * skipped, or is written out as the three-byte encoding of its own value.
 * A high surrogate in the very last source position cannot be classified
 * yet and is reported as RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL.
 *
 * Whenever conversion stops early, *pSrcCvtChars is the offset of the first
 * code unit that was not converted; a surrogate pair is never split.
 *
 * pData         not used by this converter
 * pContext      not used by this converter
 * pSrcBuf       UTF-16 input code units
 * nSrcChars     number of code units available at pSrcBuf
 * pDestBuf      receives the UTF-8 bytes
 * nDestBytes    capacity of pDestBuf in bytes
 * nFlags        RTL_UNICODETOTEXT_FLAGS_*; only the INVALID sub-mask is read
 * pInfo         receives the RTL_UNICODETOTEXT_INFO_* result bits
 * pSrcCvtChars  receives the number of source code units consumed
 *
 * Returns the number of bytes written to pDestBuf.
 */
sal_Size ImplUnicodeToUTF8( const ImplTextConverterData* pData, void* pContext,
                            const sal_Unicode* pSrcBuf, sal_Size nSrcChars,
                            sal_Char* pDestBuf, sal_Size nDestBytes,
                            sal_uInt32 nFlags, sal_uInt32* pInfo,
                            sal_Size* pSrcCvtChars )
{
    /* Marker bits of the lead byte, indexed by (sequence length - 1) */
    static sal_uInt8 const nFirstByteMarkTab[6] =
    {
        0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
    };

    sal_Unicode         c;
    sal_Unicode         c2;
    sal_uInt32          nUCS4Char;
    sal_uInt8           nBytes;         /* length of the UTF-8 sequence */
    sal_uInt8           nTrail;         /* continuation bytes left to write */
    sal_uInt8           nSrcUnits;      /* code units this character occupies */
    sal_Char*           pTempDestBuf;
    sal_Char*           pEndDestBuf;
    const sal_Unicode*  pEndSrcBuf;

    *pInfo = 0;
    pEndDestBuf = pDestBuf+nDestBytes;
    pEndSrcBuf  = pSrcBuf+nSrcChars;

    while ( pSrcBuf < pEndSrcBuf )
    {
        c = *pSrcBuf;
        if ( c < 0x80 )
        {
            if ( pDestBuf == pEndDestBuf )
            {
                *pInfo |= RTL_UNICODETOTEXT_INFO_ERROR | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
                break;
            }
            *pDestBuf = (sal_Char)(sal_uChar)c;
            pDestBuf++;
            pSrcBuf++;
        }
        else
        {
            nUCS4Char = c;
            nSrcUnits = 1;
            if ( nUCS4Char < 0x800 )
                nBytes = 2;
            else
            {
                if (ImplIsHighSurrogate(c))
                {
                    /* The partner must be inside the buffer before it */
                    /* may be read.                                    */
                    if ( pEndSrcBuf-pSrcBuf < 2 )
                    {
                        *pInfo |= RTL_UNICODETOTEXT_INFO_ERROR | RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
                        break;
                    }
                    c2 = *(pSrcBuf+1);
                    if (ImplIsLowSurrogate(c2))
                    {
                        nUCS4Char = ImplCombineSurrogates(c, c2);
                        /* Both units are consumed together, but only */
                        /* once the output has actually been written. */
                        nSrcUnits = 2;
                    }
                    else
                    {
                        *pInfo |= RTL_UNICODETOTEXT_INFO_INVALID;
                        if ( (nFlags & RTL_UNICODETOTEXT_FLAGS_INVALID_MASK) == RTL_UNICODETOTEXT_FLAGS_INVALID_ERROR )
                        {
                            *pInfo |= RTL_UNICODETOTEXT_INFO_ERROR;
                            break;
                        }
                        else if ( (nFlags & RTL_UNICODETOTEXT_FLAGS_INVALID_MASK) == RTL_UNICODETOTEXT_FLAGS_INVALID_IGNORE )
                        {
                            pSrcBuf++;
                            continue;
                        }
                        /* Otherwise the unpaired surrogate itself is    */
                        /* encoded: UTF-8 is a Unicode format, so the    */
                        /* original value is kept instead of replacing   */
                        /* it with a default character.                  */
                    }
                }

                if ( nUCS4Char < 0x10000 )
                    nBytes = 3;
                else if ( nUCS4Char < 0x200000 )
                    nBytes = 4;
                else if ( nUCS4Char < 0x4000000 )
                    nBytes = 5;
                else
                    nBytes = 6;
            }

            /* Compare distances rather than forming a pointer that may */
            /* lie beyond the buffer end.                               */
            if ( pEndDestBuf-pDestBuf < nBytes )
            {
                *pInfo |= RTL_UNICODETOTEXT_INFO_ERROR | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
                break;
            }

            /* The sequence is filled in from its last byte backwards:   */
            /* each continuation byte takes the low six bits (10xxxxxx), */
            /* what remains goes into the lead byte.                     */
            pDestBuf += nBytes;
            pTempDestBuf = pDestBuf;
            for ( nTrail = nBytes-1; nTrail; nTrail-- )
            {
                pTempDestBuf--;
                *pTempDestBuf = (sal_Char)((((sal_uChar)nUCS4Char) | 0x80) & 0xBF);
                nUCS4Char >>= 6;
            }
            pTempDestBuf--;
            *pTempDestBuf = (sal_Char)(((sal_uChar)nUCS4Char) | nFirstByteMarkTab[nBytes-1]);

            pSrcBuf += nSrcUnits;
        }
    }

    *pSrcCvtChars = nSrcChars - (pEndSrcBuf-pSrcBuf);
    return (nDestBytes - (pEndDestBuf-pDestBuf));
}