/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ /* * This file is part of the LibreOffice project. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. * * This file incorporates work covered by the following license notice: * * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed * with this work for additional information regarding copyright * ownership. The ASF licenses this file to you under the Apache * License, Version 2.0 (the "License"); you may not use this file * except in compliance with the License. You may obtain a copy of * the License at http://www.apache.org/licenses/LICENSE-2.0 . */ #include #include #include #include #include #include #include #include #include #include using namespace pdfparse; static void printHelp( const char* pExe ) { fprintf( stdout, "USAGE: %s [-h,--help]\n" " %s [-pw, --password ] []\n" " %s <-a, --extract-add-streams> [-pw, --password ] []\n" " %s <-f, --extract-fonts> [-pw, --password ] []\n" " %s <-o, --extract-objects> [:][,[:g1][,...]] [-pw, --password ] []\n" " -h, --help: show help\n" " -a, --extract-add-streams: extracts additional streams to outputfile_object\n" " and prints the mimetype found to stdout\n" " -f, --extract-fonts: extracts fonts (currently only type1 and truetype are supported\n" " -o, --extract-objects: extracts object streams, the syntax of the argument is comma separated\n" " object numbers, where object number and generation number are separated by \':\'\n" " an omitted generation number defaults to 0\n" " -pw, --password: use password for decryption\n" "\n" "note: -f, -a, -o and normal unzip operation are mutually exclusive\n" , pExe, pExe, pExe, pExe, pExe ); } namespace { class FileEmitContext : public EmitContext { oslFileHandle m_aHandle; oslFileHandle m_aReadHandle; unsigned int m_nReadLen; void openReadFile( const char* pOrigName ); public: FileEmitContext( const char* pFileName, const char* pOrigName, const PDFContainer* pTop ); virtual ~FileEmitContext() override; virtual bool write( const void* pBuf, unsigned int nLen ) noexcept override; virtual unsigned int getCurPos() noexcept override; virtual bool copyOrigBytes( unsigned int nOrigOffset, unsigned int nLen ) noexcept override; virtual unsigned int readOrigBytes( unsigned int nOrigOffset, unsigned int nLen, void* pBuf ) noexcept override; }; } FileEmitContext::FileEmitContext( const char* pFileName, const char* pOrigName, const PDFContainer* pTop ) : EmitContext( pTop ), m_aHandle( nullptr ), m_aReadHandle( nullptr ), m_nReadLen( 0 ) { OUString aSysFile( OStringToOUString( std::string_view( pFileName ), osl_getThreadTextEncoding() ) ); OUString aURL; if( osl_getFileURLFromSystemPath( aSysFile.pData, &aURL.pData ) != osl_File_E_None ) { fprintf( stderr, "filename conversion \"%s\" failed\n", pFileName ); return; } if( osl_openFile( aURL.pData, &m_aHandle, osl_File_OpenFlag_Write ) == osl_File_E_None ) { if( osl_setFileSize( m_aHandle, 0 ) != osl_File_E_None ) { fprintf( stderr, "could not truncate %s\n", pFileName ); osl_closeFile( m_aHandle ); m_aHandle = nullptr; } } else if( osl_openFile( aURL.pData, &m_aHandle, osl_File_OpenFlag_Write |osl_File_OpenFlag_Create ) != osl_File_E_None ) { fprintf( stderr, "could not open %s\n", pFileName ); return; } m_bDeflate = true; openReadFile( pOrigName ); } FileEmitContext::~FileEmitContext() { if( m_aHandle ) osl_closeFile( m_aHandle ); if( m_aReadHandle ) osl_closeFile( m_aReadHandle ); } void FileEmitContext::openReadFile( const char* pInFile ) { OUString aSysFile( OStringToOUString( std::string_view( pInFile ), osl_getThreadTextEncoding() ) ); OUString aURL; if( osl_getFileURLFromSystemPath( aSysFile.pData, &aURL.pData ) != osl_File_E_None ) { fprintf( stderr, "filename conversion \"%s\" failed\n", pInFile ); return; } if( osl_openFile( aURL.pData, &m_aReadHandle, osl_File_OpenFlag_Read ) != osl_File_E_None ) { fprintf( stderr, "could not open %s\n", pInFile ); return; } if( osl_setFilePos( m_aReadHandle, osl_Pos_End, 0 ) != osl_File_E_None ) { fprintf( stderr, "could not seek to end of %s\n", pInFile ); osl_closeFile( m_aReadHandle ); return; } sal_uInt64 nFileSize = 0; if( osl_getFilePos( m_aReadHandle, &nFileSize ) != osl_File_E_None ) { fprintf( stderr, "could not get end pos of %s\n", pInFile ); osl_closeFile( m_aReadHandle ); return; } m_nReadLen = static_cast(nFileSize); } bool FileEmitContext::write( const void* pBuf, unsigned int nLen ) noexcept { if( ! m_aHandle ) return false; sal_uInt64 nWrite = static_cast(nLen); sal_uInt64 nWritten = 0; return (osl_writeFile( m_aHandle, pBuf, nWrite, &nWritten ) == osl_File_E_None) && nWrite == nWritten; } unsigned int FileEmitContext::getCurPos() noexcept { sal_uInt64 nFileSize = 0; if( m_aHandle ) { if( osl_getFilePos( m_aHandle, &nFileSize ) != osl_File_E_None ) nFileSize = 0; } return static_cast(nFileSize); } bool FileEmitContext::copyOrigBytes( unsigned int nOrigOffset, unsigned int nLen ) noexcept { if( nOrigOffset + nLen > m_nReadLen ) return false; if( osl_setFilePos( m_aReadHandle, osl_Pos_Absolut, nOrigOffset ) != osl_File_E_None ) { fprintf( stderr, "could not seek to offset %u\n", nOrigOffset ); return false; } void* pBuf = std::malloc( nLen ); if( ! pBuf ) return false; sal_uInt64 nBytesRead = 0; if( osl_readFile( m_aReadHandle, pBuf, nLen, &nBytesRead ) != osl_File_E_None || nBytesRead != static_cast(nLen) ) { fprintf( stderr, "could not read %u bytes\n", nLen ); std::free( pBuf ); return false; } bool bRet = write( pBuf, nLen ); std::free( pBuf ); return bRet; } unsigned int FileEmitContext::readOrigBytes( unsigned int nOrigOffset, unsigned int nLen, void* pBuf ) noexcept { if( nOrigOffset + nLen > m_nReadLen ) return 0; if( osl_setFilePos( m_aReadHandle, osl_Pos_Absolut, nOrigOffset ) != osl_File_E_None ) { fprintf( stderr, "could not seek to offset %u\n", nOrigOffset ); return 0; } sal_uInt64 nBytesRead = 0; if( osl_readFile( m_aReadHandle, pBuf, nLen, &nBytesRead ) != osl_File_E_None ) return 0; return static_cast(nBytesRead); } typedef int(*PDFFileHdl)(const char*, const char*, PDFFile*); static int handleFile( const char* pInFile, const char* pOutFile, const char* pPassword, PDFFileHdl pHdl ) { int nRet = 0; std::unique_ptr pEntry = pdfparse::PDFReader::read(OStringToOUString(pInFile, osl_getThreadTextEncoding())); if( pEntry ) { PDFFile* pPDFFile = dynamic_cast(pEntry.get()); if( pPDFFile ) { fprintf( stdout, "have a %s PDF file\n", pPDFFile->isEncrypted() ? "encrypted" : "unencrypted" ); if( pPassword ) fprintf( stdout, "password %s\n", pPDFFile->setupDecryptionData( pPassword ) ? "matches" : "does not match" ); nRet = pHdl( pInFile, pOutFile, pPDFFile ); } else nRet = 20; } return nRet; } static int write_unzipFile( const char* pInFile, const char* pOutFile, PDFFile* pPDFFile ) { FileEmitContext aContext( pOutFile, pInFile, pPDFFile ); aContext.m_bDecrypt = pPDFFile->isEncrypted(); pPDFFile->emit(aContext); return 0; } static int write_addStreamArray( const char* pOutFile, PDFArray* pStreams, PDFFile* pPDFFile, const char* pInFile ) { int nRet = 0; unsigned int nArrayElements = pStreams->m_aSubElements.size(); for( unsigned int i = 0; i < nArrayElements-1 && nRet == 0; i++ ) { PDFName* pMimeType = dynamic_cast(pStreams->m_aSubElements[i].get()); PDFObjectRef* pStreamRef = dynamic_cast(pStreams->m_aSubElements[i+1].get()); if( ! pMimeType ) fprintf( stderr, "error: no mimetype element\n" ); if( ! pStreamRef ) fprintf( stderr, "error: no stream ref element\n" ); if( pMimeType && pStreamRef ) { fprintf( stdout, "found stream %d %d with mimetype %s\n", pStreamRef->m_nNumber, pStreamRef->m_nGeneration, pMimeType->m_aName.getStr() ); PDFObject* pObject = pPDFFile->findObject( pStreamRef->m_nNumber, pStreamRef->m_nGeneration ); if( pObject ) { OString aOutStream = pOutFile + OString::Concat("_stream_") + OString::number( sal_Int32(pStreamRef->m_nNumber) ) + "_" + OString::number( sal_Int32(pStreamRef->m_nGeneration) ); FileEmitContext aContext( aOutStream.getStr(), pInFile, pPDFFile ); aContext.m_bDecrypt = pPDFFile->isEncrypted(); pObject->writeStream( aContext, pPDFFile ); } else { fprintf( stderr, "object not found\n" ); nRet = 121; } } else nRet = 120; } return nRet; } static int write_addStreams( const char* pInFile, const char* pOutFile, PDFFile* pPDFFile ) { // find all trailers int nRet = 0; unsigned int nElements = pPDFFile->m_aSubElements.size(); for( unsigned i = 0; i < nElements && nRet == 0; i++ ) { PDFTrailer* pTrailer = dynamic_cast(pPDFFile->m_aSubElements[i].get()); if( pTrailer && pTrailer->m_pDict ) { // search for AdditionalStreams entry auto add_stream = pTrailer->m_pDict->m_aMap.find( "AdditionalStreams"_ostr ); if( add_stream != pTrailer->m_pDict->m_aMap.end() ) { PDFArray* pStreams = dynamic_cast(add_stream->second); if( pStreams ) nRet = write_addStreamArray( pOutFile, pStreams, pPDFFile, pInFile ); } } } return nRet; } static int write_fonts( const char* i_pInFile, const char* i_pOutFile, PDFFile* i_pPDFFile ) { unsigned int nElements = i_pPDFFile->m_aSubElements.size(); for (unsigned i = 0; i < nElements; i++) { // search FontDescriptors PDFObject* pObj = dynamic_cast(i_pPDFFile->m_aSubElements[i].get()); if( ! pObj ) continue; PDFDict* pDict = dynamic_cast(pObj->m_pObject); if( ! pDict ) continue; std::unordered_map::iterator map_it = pDict->m_aMap.find( "Type"_ostr ); if( map_it == pDict->m_aMap.end() ) continue; PDFName* pName = dynamic_cast(map_it->second); if( ! pName ) continue; if( pName->m_aName != "FontDescriptor" ) continue; // the font name will be helpful, also there must be one in // a font descriptor map_it = pDict->m_aMap.find( "FontName"_ostr ); if( map_it == pDict->m_aMap.end() ) continue; pName = dynamic_cast(map_it->second); if( ! pName ) continue; OString aFontName( pName->m_aName ); PDFObjectRef* pStreamRef = nullptr; const char* pFileType = nullptr; // we have a font descriptor, try for a type 1 font map_it = pDict->m_aMap.find( "FontFile"_ostr ); if( map_it != pDict->m_aMap.end() ) { pStreamRef = dynamic_cast(map_it->second); if( pStreamRef ) pFileType = "pfa"; } // perhaps it's a truetype file ? if( ! pStreamRef ) { map_it = pDict->m_aMap.find( "FontFile2"_ostr ); if( map_it != pDict->m_aMap.end() ) { pStreamRef = dynamic_cast(map_it->second); if( pStreamRef ) pFileType = "ttf"; } } if( ! pStreamRef ) continue; PDFObject* pStream = i_pPDFFile->findObject( pStreamRef ); if( ! pStream ) continue; OStringBuffer aOutStream( OString::Concat(i_pOutFile) + "_font_" + OString::number( sal_Int32(pStreamRef->m_nNumber) ) + "_" + OString::number( sal_Int32(pStreamRef->m_nGeneration) ) + "_" + aFontName ); if( pFileType ) { aOutStream.append( OString::Concat(".") + pFileType ); } FileEmitContext aContext( aOutStream.getStr(), i_pInFile, i_pPDFFile ); aContext.m_bDecrypt = i_pPDFFile->isEncrypted(); pStream->writeStream( aContext, i_pPDFFile ); } return 0; } static std::vector< std::pair< sal_Int32, sal_Int32 > > s_aEmitObjects; static int write_objects( const char* i_pInFile, const char* i_pOutFile, PDFFile* i_pPDFFile ) { unsigned int nElements = s_aEmitObjects.size(); for (unsigned i = 0; i < nElements; i++) { sal_Int32 nObject = s_aEmitObjects[i].first; sal_Int32 nGeneration = s_aEmitObjects[i].second; PDFObject* pStream = i_pPDFFile->findObject( nObject, nGeneration ); if( ! pStream ) { fprintf( stderr, "object %d %d not found !\n", static_cast(nObject), static_cast(nGeneration) ); continue; } OString aOutStream = i_pOutFile + OString::Concat("_stream_") + OString::number( nObject ) + "_" + OString::number( nGeneration ); FileEmitContext aContext( aOutStream.getStr(), i_pInFile, i_pPDFFile ); aContext.m_bDecrypt = i_pPDFFile->isEncrypted(); pStream->writeStream( aContext, i_pPDFFile ); } return 0; } SAL_IMPLEMENT_MAIN_WITH_ARGS( argc, argv ) { const char* pInFile = nullptr; const char* pOutFile = nullptr; const char* pPassword = nullptr; OStringBuffer aOutFile( 256 ); PDFFileHdl aHdl = write_unzipFile; for( int nArg = 1; nArg < argc; nArg++ ) { if( argv[nArg][0] == '-' ) { if( ! rtl_str_compare( "-pw", argv[nArg] ) || ! rtl_str_compare( "--password" , argv[nArg] ) ) { if( nArg == argc-1 ) { fprintf( stderr, "no password given\n" ); return 1; } nArg++; pPassword = argv[nArg]; } else if( ! rtl_str_compare( "-h", argv[nArg] ) || ! rtl_str_compare( "--help", argv[nArg] ) ) { printHelp( argv[0] ); return 0; } else if( ! rtl_str_compare( "-a", argv[nArg] ) || ! rtl_str_compare( "--extract-add-streams", argv[nArg] ) ) { aHdl = write_addStreams; } else if( ! rtl_str_compare( "-f", argv[nArg] ) || ! rtl_str_compare( "--extract-fonts", argv[nArg] ) ) { aHdl = write_fonts; } else if( ! rtl_str_compare( "-o", argv[nArg] ) || ! rtl_str_compare( "--extract-objects", argv[nArg] ) ) { aHdl = write_objects; nArg++; if( nArg < argc ) { OString aObjs( argv[nArg] ); sal_Int32 nIndex = 0; while( nIndex != -1 ) { OString aToken( aObjs.getToken( 0, ',', nIndex ) ); sal_Int32 nObject = 0; sal_Int32 nGeneration = 0; sal_Int32 nGenIndex = 0; nObject = o3tl::toInt32( o3tl::getToken( aToken, 0, ':', nGenIndex ) ); if( nGenIndex != -1 ) nGeneration = o3tl::toInt32( o3tl::getToken(aToken, 0, ':', nGenIndex )); s_aEmitObjects.push_back( std::pair(nObject,nGeneration) ); } } } else { fprintf( stderr, "unrecognized option \"%s\"\n", argv[nArg] ); printHelp( argv[0] ); return 1; } } else if( pInFile == nullptr ) pInFile = argv[nArg]; else if( pOutFile == nullptr ) pOutFile = argv[nArg]; } if( ! pInFile ) { fprintf( stderr, "no input file given\n" ); return 10; } if( ! pOutFile ) { OString aFile( pInFile ); if( aFile.getLength() > 0 ) { if( aFile.getLength() > 4 ) { if( aFile.matchIgnoreAsciiCase( ".pdf", aFile.getLength()-4 ) ) aOutFile.append( pInFile, aFile.getLength() - 4 ); else aOutFile.append( aFile ); } aOutFile.append( "_unzip.pdf" ); pOutFile = aOutFile.getStr(); } else { fprintf( stderr, "no output file given\n" ); return 11; } } return handleFile( pInFile, pOutFile, pPassword, aHdl ); } /* vim:set shiftwidth=4 softtabstop=4 expandtab: */