a2f1be46f2
Change-Id: I4359b7042f98586e2c9f5529d83d769cdf3d033c Reviewed-on: https://gerrit.libreoffice.org/85775 Tested-by: Jenkins Reviewed-by: Noel Grandin <noel.grandin@collabora.co.uk>
172 lines
6.9 KiB
C++
172 lines
6.9 KiB
C++
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; fill-column: 100 -*- */
|
|
/*
|
|
* This file is part of the LibreOffice project.
|
|
*
|
|
* This Source Code Form is subject to the terms of the Mozilla Public
|
|
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
*
|
|
* This file incorporates work covered by the following license notice:
|
|
*
|
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
* contributor license agreements. See the NOTICE file distributed
|
|
* with this work for additional information regarding copyright
|
|
* ownership. The ASF licenses this file to you under the Apache
|
|
* License, Version 2.0 (the "License"); you may not use this file
|
|
* except in compliance with the License. You may obtain a copy of
|
|
* the License at http://www.apache.org/licenses/LICENSE-2.0 .
|
|
*/
|
|
|
|
#include "HtmlFmtFlt.hxx"

#include <rtl/string.h>
#include <osl/diagnose.h>

#include <algorithm>
#include <cassert>
#include <cstring>
#include <iomanip>
#include <sstream>
#include <string>
#include <vector>
|
|
|
|
using namespace com::sun::star::uno;
|
|
|
|
// converts the openoffice text/html clipboard format to the HTML Format
|
|
// well known under MS Windows
|
|
// the MS HTML Format has a header before the real html data
|
|
|
|
// Version:1.0 Version number of the clipboard. Starting version is 0.9
|
|
// StartHTML: Byte count from the beginning of the clipboard to the start
|
|
// of the context, or -1 if no context
|
|
// EndHTML: Byte count from the beginning of the clipboard to the end
|
|
// of the context, or -1 if no context
|
|
// StartFragment: Byte count from the beginning of the clipboard to the
|
|
// start of the fragment
|
|
// EndFragment: Byte count from the beginning of the clipboard to the
|
|
// end of the fragment
|
|
// StartSelection: Byte count from the beginning of the clipboard to the
|
|
// start of the selection
|
|
// EndSelection: Byte count from the beginning of the clipboard to the
|
|
// end of the selection
|
|
|
|
// StartSelection and EndSelection are optional
|
|
// The fragment should be preceded and followed by the HTML comments
|
|
// <!--StartFragment--> and <!--EndFragment--> (no space between !-- and the
|
|
// text)
|
|
|
|
namespace
{
/** Builds the textual header that precedes the html data in the MS
    "HTML Format" clipboard representation.

    Every offset is written as a zero-padded decimal number that is at least
    ten characters wide, and every header line is terminated with CR LF.

    @param startHtml      value written after "StartHTML:"
    @param endHtml        value written after "EndHTML:"
    @param startFragment  value written after "StartFragment:"
    @param endFragment    value written after "EndFragment:"

    @return the complete header text
*/
std::string GetHtmlFormatHeader(size_t startHtml, size_t endHtml, size_t startFragment,
                                size_t endFragment)
{
    std::ostringstream header;

    // The field width is reset after each formatted insertion, so it is
    // applied again for every offset that gets written.
    const auto appendOffset = [&header](const char* key, size_t offset) {
        header << key << std::setw(10) << std::setfill('0') << std::dec << offset << "\r\n";
    };

    header << "Version:1.0\r\n";
    appendOffset("StartHTML:", startHtml);
    appendOffset("EndHTML:", endHtml);
    appendOffset("StartFragment:", startFragment);
    appendOffset("EndFragment:", endFragment);

    return header.str();
}
}
|
|
|
|
// Opening and closing html tags that are searched for in the text/html data.
// They are looked up literally: exact case, no attributes and no embedded
// whitespace.
// NOTE(review): the literals are lower case - confirm that this matches what
// the office actually writes.
const std::string TAG_HTML("<html>");
const std::string TAG_END_HTML("</html>");

// Only the leading part of the body tags is stored, because the opening tag
// may carry parameters (e.g. <BODY param>, #92840#); the closing '>' has to
// be searched for separately.
const std::string TAG_BODY("<body");
const std::string TAG_END_BODY("</body");
|
|
|
|
/** Converts text/html clipboard data into the MS Windows "HTML Format".

    The result consists of the header produced by GetHtmlFormatHeader(),
    followed by the unchanged html data and one trailing '\0' byte.

    The header offsets are derived from the positions of TAG_HTML,
    TAG_END_HTML, TAG_BODY and TAG_END_BODY inside the data. A tag that cannot
    be found must not take part in the offset arithmetic (std::string::npos
    would silently wrap around and yield offsets that point into the header or
    are out of order); in that case the respective offset falls back to the
    start or the end of the html data.
    NOTE(review): the fallback "use the whole data" is a conservative choice -
    confirm that it suits all consumers.

    @param aTextHtml  the text/html bytes; must not be empty

    @return the "HTML Format" bytes, or an empty sequence if aTextHtml is empty
*/
Sequence<sal_Int8> SAL_CALL TextHtmlToHTMLFormat(Sequence<sal_Int8> const& aTextHtml)
{
    OSL_ASSERT(aTextHtml.getLength() > 0);

    if (aTextHtml.getLength() <= 0)
        return Sequence<sal_Int8>();

    // a header filled with dummy values has the same length as the final
    // header, which is what all offsets have to be shifted by
    const std::string::size_type lHtmlFormatHeader = GetHtmlFormatHeader(0, 0, 0, 0).length();

    const char* const pTextBegin = reinterpret_cast<const char*>(aTextHtml.getConstArray());
    const std::string textHtml(pTextBegin, pTextBegin + aTextHtml.getLength());

    // offsets of the first and one past the last byte of the html data;
    // used whenever a tag is missing
    const std::string::size_type nDataStart = lHtmlFormatHeader;
    const std::string::size_type nDataEnd = lHtmlFormatHeader + textHtml.length();

    std::string::size_type nStartHtml = nDataStart;
    const std::string::size_type nHtmlTagPos = textHtml.find(TAG_HTML);
    if (nHtmlTagPos != std::string::npos)
    {
        // we start one before '<HTML>' Word 2000 does also so
        nStartHtml = nHtmlTagPos + lHtmlFormatHeader - 1;
    }

    std::string::size_type nEndHtml = nDataEnd;
    const std::string::size_type nEndHtmlTagPos = textHtml.find(TAG_END_HTML);
    if (nEndHtmlTagPos != std::string::npos)
    {
        // our SOffice 5.2 wants 2 behind </HTML>?
        nEndHtml = nEndHtmlTagPos + lHtmlFormatHeader + TAG_END_HTML.length() + 1;
    }

    // The body tag may have parameters so we need to search for the
    // closing '>' manually e.g. <BODY param> #92840#
    std::string::size_type nStartFragment = nDataStart;
    const std::string::size_type nBodyTagPos = textHtml.find(TAG_BODY);
    if (nBodyTagPos != std::string::npos)
    {
        const std::string::size_type nBodyTagClose = textHtml.find('>', nBodyTagPos);
        if (nBodyTagClose != std::string::npos)
            nStartFragment = nBodyTagClose + lHtmlFormatHeader + 1;
    }

    std::string::size_type nEndFragment = nDataEnd;
    const std::string::size_type nEndBodyTagPos = textHtml.find(TAG_END_BODY);
    if (nEndBodyTagPos != std::string::npos)
        nEndFragment = nEndBodyTagPos + lHtmlFormatHeader;

    std::string htmlFormat
        = GetHtmlFormatHeader(nStartHtml, nEndHtml, nStartFragment, nEndFragment);
    htmlFormat += textHtml;

    Sequence<sal_Int8> byteSequence(htmlFormat.length() + 1); // space the trailing '\0'
    memset(byteSequence.getArray(), 0, byteSequence.getLength());

    memcpy(static_cast<void*>(byteSequence.getArray()),
           static_cast<const void*>(htmlFormat.c_str()), htmlFormat.length());

    return byteSequence;
}
|
|
|
|
const char* const HtmlStartTag = "<html";
|
|
|
|
Sequence<sal_Int8> HTMLFormatToTextHtml(const Sequence<sal_Int8>& aHTMLFormat)
|
|
{
|
|
assert(isHTMLFormat(aHTMLFormat) && "No HTML Format provided");
|
|
|
|
Sequence<sal_Int8>& nonconstHTMLFormatRef = const_cast<Sequence<sal_Int8>&>(aHTMLFormat);
|
|
char* dataStart = reinterpret_cast<char*>(nonconstHTMLFormatRef.getArray());
|
|
char* dataEnd = dataStart + nonconstHTMLFormatRef.getLength() - 1;
|
|
const char* htmlStartTag = strcasestr(dataStart, HtmlStartTag);
|
|
|
|
assert(htmlStartTag && "Seems to be no HTML at all");
|
|
|
|
// It doesn't seem to be HTML? Well then simply return what has been
|
|
// provided in non-debug builds
|
|
if (htmlStartTag == nullptr)
|
|
{
|
|
return aHTMLFormat;
|
|
}
|
|
|
|
sal_Int32 len = dataEnd - htmlStartTag;
|
|
Sequence<sal_Int8> plainHtmlData(len);
|
|
|
|
memcpy(static_cast<void*>(plainHtmlData.getArray()), htmlStartTag, len);
|
|
|
|
return plainHtmlData;
|
|
}
|
|
|
|
/* A simple format detection. We are just comparing the first few bytes
|
|
of the provided byte sequence to see whether or not it is the MS
|
|
Office Html format. If it shows that this is not reliable enough we
|
|
can improve this
|
|
*/
|
|
const char HtmlFormatStart[] = "Version:";
|
|
int const HtmlFormatStartLen = (sizeof(HtmlFormatStart) - 1);
|
|
|
|
bool isHTMLFormat(const Sequence<sal_Int8>& aHtmlSequence)
|
|
{
|
|
if (aHtmlSequence.getLength() < HtmlFormatStartLen)
|
|
return false;
|
|
|
|
return rtl_str_compareIgnoreAsciiCase_WithLength(
|
|
HtmlFormatStart, HtmlFormatStartLen,
|
|
reinterpret_cast<const char*>(aHtmlSequence.getConstArray()), HtmlFormatStartLen)
|
|
== 0;
|
|
}
|
|
|
|
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|