office-gobmx/include/rtl/uri.h
Stephan Bergmann cef555e9a4 Make ExternalReferenceUriTranslator more robust against broken UTF-8
<https://lists.freedesktop.org/archives/libreoffice/2023-November/091151.html>
"CppunitTest_stoc_uriproc failed on Windows" reports that
translateToExternal("file:///abc/%feef") produces an empty string (indicating
failure) instead of "file:///abc/%FEef" (as expected in
stoc/test/uriproc/test_uriproc.cxx) when osl_getThreadTextEncoding() is Shift
JIS.

This was due to how the call to rtl::Uri::encode in
Translator::translateToExternal (in
stoc/source/uriproc/ExternalUriReferenceTranslator.cxx) behaved: it internally
interpreted its input "%FE" as the single-byte Shift JIS character 0xFE, which
gets mapped to U+2122 as an extension (see "APPLE additions over SJIS, we
convert this like Apple, because I think, this gives better result, then [sic]
we take a replacement char" in sal/textenc/tcvtjp6.tab) in readUcs4, but which
in turn doesn't get mapped back to any Shift JIS character in writeEscapeChar.

Translator::translateToExternal is the only user of
rtl_UriEncodeStrictKeepEscapes, as introduced by
6ff5d3341d "INTEGRATION: CWS c07v013_SRC680
(1.4.40); FILE MERGED: 2007/06/21 13:00:56 sb 1.4.40.1: #b6550116# Made
XExternalUriReferenceTranslator.translateToExternal more robust when the input
URL contains spurious non--UTF-8 octets like %FE (which are now copied verbatim,
instead of signalling error)."

To make the claim true that such "spurious non--UTF-8 octets like %FE" are
always "copied verbatim", regardless of text encoding being used, repurpose
rtl_UriEncodeStrictKeepEscapes to always treat any escape sequences that are
present as (potentially broken) UTF-8.

Change-Id: I0fa0b14d3e3d44e4b5514e1b73c84c407a947ce9
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/158888
Tested-by: Jenkins
Reviewed-by: Stephan Bergmann <sbergman@redhat.com>
2023-11-03 17:26:58 +01:00

362 lines
12 KiB
C

/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
* This file is part of the LibreOffice project.
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*
* This file incorporates work covered by the following license notice:
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed
* with this work for additional information regarding copyright
* ownership. The ASF licenses this file to you under the Apache
* License, Version 2.0 (the "License"); you may not use this file
* except in compliance with the License. You may obtain a copy of
* the License at http://www.apache.org/licenses/LICENSE-2.0 .
*/
/*
* This file is part of LibreOffice published API.
*/
#ifndef INCLUDED_RTL_URI_H
#define INCLUDED_RTL_URI_H
#include "sal/config.h"
#include "rtl/textenc.h"
#include "rtl/ustring.h"
#include "sal/saldllapi.h"
#include "sal/types.h"
#if defined __cplusplus
extern "C" {
#endif /* __cplusplus */
/** Various predefined URI 'char classes.'
A 'char class' defines which (ASCII) characters can be written 'as they
are' in a part of a URI, and which characters have to be written using
escape sequences ('%' followed by two hex digits). Characters outside
the ASCII range are always written using escape sequences.
None of the predefined char classes below lists '%' as a valid character.
Use rtl_getUriCharClass() to map a member of this enumeration to the
array of 128 booleans that rtl_uriEncode() expects.
If there are other frequently used char classes, they can be added to
this enumeration; the function rtl_getUriCharClass() has to be adapted
then, too.
*/
typedef enum
{
/** The empty char class.
All characters are written using escape sequences.
*/
rtl_UriCharClassNone,
/** The RFC 2732 @<uric> char class.
@verbatim
The 'valid' characters are !$&'()*+,-./:;=?@[]_~ plus digits and
letters.
This differs from RFC 3986 @<fragment> in additionally allowing []
@endverbatim
*/
rtl_UriCharClassUric,
/** The RFC 2396 @<uric_no_slash> char class.
@verbatim
The 'valid' characters are !$&'()*+,-.:;=?@_~ plus digits and letters.
This differs from RFC 3986 @<fragment> in additionally encoding /
This differs from RFC 3986 @<pchar> in additionally allowing ?
@endverbatim
*/
rtl_UriCharClassUricNoSlash,
/** The RFC 2396 @<rel_segment> char class.
@verbatim
The 'valid' characters are !$&'()*+,-.;=@_~ plus digits and letters.
This is the same as RFC 3986 @<segment-nz-nc>
@endverbatim
*/
rtl_UriCharClassRelSegment,
/** The RFC 2396 @<reg_name> char class.
@verbatim
The 'valid' characters are !$&'()*+,-.:;=@_~ plus digits and letters.
This differs from RFC 3986 @<reg_name> in additionally allowing @
@endverbatim
*/
rtl_UriCharClassRegName,
/** The RFC 2396 @<userinfo> char class.
@verbatim
The 'valid' characters are !$&'()*+,-.:;=_~ plus digits and letters.
This is the same as RFC 3986 @<userinfo>
@endverbatim
*/
rtl_UriCharClassUserinfo,
/** The RFC 2396 @<pchar> char class.
@verbatim
The 'valid' characters are !$&'()*+,-.:=@_~ plus digits and letters.
This differs from RFC 3986 @<pchar> in additionally encoding ;
@endverbatim
*/
rtl_UriCharClassPchar,
/** The char class for the values of uno URL parameters.
@verbatim
The 'valid' characters are !$&'()*+-./:?@_~ plus digits and letters.
@endverbatim
*/
rtl_UriCharClassUnoParamValue,
/* NOTE(review): not a char class of its own; presumably a sentinel that
   pins the size of the enum type via SAL_MAX_ENUM (which is not defined in
   this header) -- confirm before relying on it. */
rtl_UriCharClass_FORCE_EQUAL_SIZE = SAL_MAX_ENUM
}
rtl_UriCharClass;
/** The mechanism describing how escape sequences in the input of
rtl_uriEncode() are handled.
An escape sequence is a '%' followed by two hex digits.
*/
typedef enum
{
/** The special meaning of '%' is ignored (i.e., there are by definition
no escape sequences in the input).
This mechanism is useful to encode user input as part of a URI (e.g.,
the user-supplied password in an ftp URL---'%20abcde' is a valid
password, so do not assume that the '%20' is an escaped space).
*/
rtl_UriEncodeIgnoreEscapes,
/** All escape sequences ('%' followed by two hex digits) are kept intact,
even if they represent characters that need not be escaped or if they
do not even map to characters in the given charset.
This mechanism is useful when passing on complete URIs more or less
unmodified (e.g., within an HTTP proxy): missing escape sequences are
added, but existing escape sequences are not touched (except that any
lower case hex digits are replaced by upper case hex digits).
*/
rtl_UriEncodeKeepEscapes,
/** All escape sequences ('%' followed by two hex digits) are resolved in
a first step; only those that represent characters that need to be
escaped are kept intact.
This mechanism is useful to properly encode complete URIs entered by
the user: the URI is brought into a 'canonic form,' but care is taken
not to damage (valid) escape sequences the (careful) user already
entered as such.
*/
rtl_UriEncodeCheckEscapes,
/** Like rtl_UriEncodeIgnoreEscapes, but indicating failure when converting
unmappable characters.
@since UDK 3.2.0
*/
rtl_UriEncodeStrict,
/** Like rtl_UriEncodeKeepEscapes, but indicating failure when converting
unmappable characters.
Also, any escape sequences that are present are always considered to be
(potentially broken) UTF-8, independent of the charset passed to
rtl_uriEncode(). This mechanism is meant to be used on the result of a
rtl_UriDecodeToIuri decoding, which will thus only contain escape
sequences representing either ASCII characters or broken UTF-8 sequences,
and which will all be kept as-is.
@since UDK 3.2.7
*/
rtl_UriEncodeStrictKeepEscapes,
/* NOTE(review): not an encode mechanism of its own; presumably a sentinel
   that pins the size of the enum type via SAL_MAX_ENUM (which is not
   defined in this header) -- confirm before relying on it. */
rtl_UriEncode_FORCE_EQUAL_SIZE = SAL_MAX_ENUM
}
rtl_UriEncodeMechanism;
/** The mechanism describing how rtl_uriDecode() translates (part of) a URI
into a Unicode string.
*/
typedef enum
{
/** The text is returned completely unmodified.
*/
rtl_UriDecodeNone,
/** The text is returned in the form of an IURI (cf.
draft-masinter-url-i18n-05.txt).
All escape sequences representing ASCII characters (%00--%7F) are
kept, all other escape sequences are interpreted as UTF-8 characters
and translated to Unicode, if possible.
*/
rtl_UriDecodeToIuri,
/** The text is decoded.
All escape sequences representing characters from the given charset
(the eCharset argument of rtl_uriDecode()) are decoded and translated to
Unicode, if possible.
*/
rtl_UriDecodeWithCharset,
/** Like rtl_UriDecodeWithCharset, but indicating failure when converting
unmappable characters (see the pResult parameter of rtl_uriDecode() for
how the failure is reported).
@since UDK 3.2.0
*/
rtl_UriDecodeStrict,
/* NOTE(review): not a decode mechanism of its own; presumably a sentinel
   that pins the size of the enum type via SAL_MAX_ENUM (which is not
   defined in this header) -- confirm before relying on it. */
rtl_UriDecode_FORCE_EQUAL_SIZE = SAL_MAX_ENUM
}
rtl_UriDecodeMechanism;
/* NOTE(review): this header does not state the lifetime or ownership of the
   returned array; presumably it is static storage that the caller must not
   free -- confirm against the implementation. */
/** Map a predefined rtl_UriCharClass to a form usable by rtl_uriEncode().
The function rtl_uriEncode() expects an array of 128 booleans, and this
function maps rtl_UriCharClass enumeration members to such arrays.
@param eCharClass
Any valid member of rtl_UriCharClass.
@return
An array of 128 booleans, one per ASCII character (true means the
character is kept unencoded, false means it is encoded), to be used in
calls to rtl_uriEncode(). The array is handed out as pointer-to-const and
must not be modified through the returned pointer.
*/
SAL_DLLPUBLIC sal_Bool const * SAL_CALL rtl_getUriCharClass(rtl_UriCharClass eCharClass)
SAL_THROW_EXTERN_C();
/* NOTE(review): rtl_UriEncodeStrictKeepEscapes is documented as also
   "indicating failure when converting unmappable characters", but the
   pResult description below only names rtl_UriEncodeStrict.  Presumably
   both mechanisms report failure through an empty result string -- confirm
   against the implementation and extend the pResult text accordingly. */
/** Encode a text as (part of) a URI.
@param pText
Any Unicode string. Must not be null.
@param pCharClass
A char class, represented as an array of 128 booleans (true means keep the
corresponding ASCII character unencoded, false means encode it). Must not
be null, and the boolean corresponding to the percent sign (0x25) must be
false. (See rtl_getUriCharClass() for a function mapping from
rtl_UriCharClass to such arrays.)
@param eMechanism
The mechanism describing how escape sequences in the input text are
handled (see rtl_UriEncodeMechanism).
@param eCharset
When Unicode characters from the input text have to be written using
escape sequences (because they are either outside the ASCII range or do
not belong to the given char class), they are first translated into this
charset before being encoded using escape sequences.
Also, if the encode mechanism is rtl_UriEncodeCheckEscapes, all escape
sequences already present in the input text are interpreted as characters
from this charset.
If the encode mechanism is rtl_UriEncodeStrictKeepEscapes, escape
sequences already present in the input text are instead always considered
to be (potentially broken) UTF-8, independent of this charset.
@param pResult
Returns an encoded representation of the input text. Must itself not be
null, and must point to either null or a valid string.
If the encode mechanism is rtl_UriEncodeStrict, and pText cannot be
converted to eCharset because it contains unmappable characters (which
implies that pText is not empty), then an empty string is returned.
*/
SAL_DLLPUBLIC void SAL_CALL rtl_uriEncode(
rtl_uString * pText,
sal_Bool const * pCharClass,
rtl_UriEncodeMechanism eMechanism,
rtl_TextEncoding eCharset,
rtl_uString ** pResult)
SAL_THROW_EXTERN_C();
/** Decode (a part of) a URI.
@param pText
Any Unicode string. Must not be null. (If the input is indeed part of a
valid URI, this string will only contain a subset of the ASCII characters,
but this function also handles other Unicode characters properly.)
@param eMechanism
The mechanism describing how the input text is translated into a Unicode
string (see rtl_UriDecodeMechanism).
@param eCharset
When the decode mechanism is rtl_UriDecodeWithCharset or
rtl_UriDecodeStrict, all escape sequences in the input text are
interpreted as characters from this charset. Those characters are
translated to Unicode characters in the resulting output, if possible.
When the decode mechanism is rtl_UriDecodeNone or rtl_UriDecodeToIuri,
this parameter is ignored (and is best specified as
RTL_TEXTENCODING_UTF8).
@param pResult
Returns a decoded representation of the input text. Must itself not be
null, and must point to either null or a valid string.
If the decode mechanism is rtl_UriDecodeStrict, and pText cannot be
converted to eCharset because it contains (encodings of) unmappable
characters (which implies that pText is not empty), then an empty string is
returned.
*/
SAL_DLLPUBLIC void SAL_CALL rtl_uriDecode(
rtl_uString * pText,
rtl_UriDecodeMechanism eMechanism,
rtl_TextEncoding eCharset,
rtl_uString ** pResult)
SAL_THROW_EXTERN_C();
/** Convert a relative URI reference into an absolute URI.
This function uses the strict parser algorithm described in RFC 3986,
section 5.2.
This function signals exceptions by returning false and letting pException
point to a message explaining the exception; it does not throw.
@param pBaseUriRef
An absolute URI that serves as the base URI. If it has to be inspected
(i.e., pRelUriRef is not an absolute URI already), and it is not an absolute
URI (i.e., does not begin with a @<scheme ":"> part), an exception will be
signalled.
@param pRelUriRef
A URI reference that may be either absolute or relative. If it is
absolute, it will be returned unmodified.
@param pResult
Returns an absolute URI. Must itself not be null, and must point to either
null or a valid string. If an exception is signalled, it is left unchanged.
@param pException
Returns an explanatory message in case an exception is signalled. Must
itself not be null, and must point to either null or a valid string. If no
exception is signalled, it is left unchanged.
@return
True if no exception is signalled (pResult then holds the absolute URI),
otherwise false (pException then holds the explanatory message).
*/
SAL_DLLPUBLIC sal_Bool SAL_CALL rtl_uriConvertRelToAbs(
rtl_uString * pBaseUriRef,
rtl_uString * pRelUriRef,
rtl_uString ** pResult,
rtl_uString ** pException)
SAL_THROW_EXTERN_C();
#if defined __cplusplus
}
#endif /* __cplusplus */
#endif // INCLUDED_RTL_URI_H
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */