mirror of
https://github.com/ONLYOFFICE/core.git
synced 2026-04-07 13:55:33 +08:00
538 lines
14 KiB
C++
538 lines
14 KiB
C++
#include "../HTMLReaderLib/LiteHTMLReader.h"
|
|
#include "../HTMLReaderLib/LiteHTMLConverter.h"
|
|
#include "Windows.h"
|
|
|
|
static const TCHAR *pcRootHtml = _T("<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">");
|
|
static const TCHAR *pcXmlSubtag = _T("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
|
|
static const TCHAR *pcDoctypeSubtag = _T("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">");
|
|
static const TCHAR *pcHtmlTagStart = _T("<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">");
|
|
static const TCHAR *pcHtmlTagEnd = _T("</html>");
|
|
static const TCHAR *pcTagEnd = _T(">");
|
|
|
|
|
|
// Out-of-class definition of the static data member CLiteHTMLElemAttr::_namedColors.
// It is constructed with 166, which the inline comment labels as a block size.
// NOTE(review): the CNamedColors type is declared outside this file; the exact meaning of
// the constructor argument should be confirmed against that declaration.
CLiteHTMLElemAttr::CNamedColors CLiteHTMLElemAttr::_namedColors(166 /* block size */);
|
|
// Out-of-class definition of the static data member CLiteHTMLConverter::m_mTagsInfo.
// It is constructed with 200, which the inline comment labels as a block size.
// NOTE(review): the CTagsInfo type is declared outside this file; the exact meaning of
// the constructor argument should be confirmed against that declaration.
CLiteHTMLConverter::CTagsInfo CLiteHTMLConverter::m_mTagsInfo(200 /* block size */);
|
|
|
|
|
|
class CXHTMLWriter: public IXHTMLWriter
|
|
{
|
|
private:
|
|
CString m_sOutput;
|
|
HANDLE m_hFile;
|
|
DWORD m_dwErrorCode;
|
|
BOOL m_bLastTagClosed;
|
|
|
|
public:
|
|
CXHTMLWriter ()
|
|
: m_hFile (NULL)
|
|
, m_dwErrorCode (0)
|
|
, m_bLastTagClosed (TRUE)
|
|
{
|
|
}
|
|
virtual ~CXHTMLWriter ()
|
|
{
|
|
CloseOutput ();
|
|
}
|
|
protected:
|
|
|
|
// preprocess <img> tag
|
|
BOOL PreprocessImg (CHTMLTagWrapper *pTagWrapper)
|
|
{
|
|
/*
|
|
CLiteHTMLTag *pLiteTag = pTagWrapper->Tag();
|
|
CLiteHTMLAttributes *pAttrs = pLiteTag->getAttributes();
|
|
|
|
for (int nAttr = 0; nAttr < pAttrs->getCount(); nAttr++)
|
|
{
|
|
|
|
CLiteHTMLElemAttr oAttr = pAttrs->getAttribute(nAttr);
|
|
CString sAttrName = oAttr.getName();
|
|
CString sAttrValue = oAttr.getValue();
|
|
|
|
if (sAttrName.GetLength() <= 0 || sAttrName.Find(_T(":")) != -1 || sAttrValue.GetLength() <= 0)
|
|
continue;
|
|
|
|
sAttrName.MakeLower();
|
|
|
|
if (sAttrName = _T("src"))
|
|
{
|
|
CString sValueEncode = sAttrValue;
|
|
HtmlUtils::DecodeUrl (sAttrValue);
|
|
CString sResultPath = _T("");
|
|
CString sFromRelativePath = HtmlUtils::full_from_relative( (const std::wstring) m_sOutput.GetBuffer(), (const std::wstring) sAttrValue.GetBuffer() );
|
|
|
|
if (HtmlUtils::FileExist (&sAttrValue))
|
|
{
|
|
sResultPath = sAttrValue;
|
|
}
|
|
else if (HtmlUtils::FileExist(&sFromRelativePath) )
|
|
{
|
|
sResultPath = sFromRelativePath;
|
|
}
|
|
else
|
|
{
|
|
CString sPath = sValueEncode;
|
|
if ( m_bDownloadImages && ( -1 != sPath.Find( _T("www") ) || -1 != sPath.Find( _T("http") ) || -1 != sPath.Find( _T("ftp") ) ) )
|
|
{
|
|
CFileDownloader oDownloader (sPath, FALSE);
|
|
oDownloader.StartWork (1);
|
|
while ( oDownloader.IsRunned() )
|
|
{
|
|
::Sleep( 10 );
|
|
}
|
|
|
|
if ( oDownloader.IsFileDownloaded() )
|
|
{
|
|
sResultPath = oDownloader.GetFilePath().GetBuffer();
|
|
m_arrImageFileName.Add( oDownloader.GetFilePath() );
|
|
}
|
|
else
|
|
sResultPath = _T("");
|
|
}
|
|
}
|
|
if (!sResultPath.IsEmpty())
|
|
{
|
|
// check width and height
|
|
CSS::CHtmlCSS oTempCSS(_T(""), pAttrs->getValueFromName(_T("style")));
|
|
|
|
CSS::CHtmlCSSProperty *pProperty = NULL;
|
|
if (NULL != (pProperty = oTempCss.get_Property (CSS::ptWidth))) // width property
|
|
{
|
|
}
|
|
}
|
|
}
|
|
}
|
|
*/
|
|
}
|
|
BOOL PreprocessA (CHTMLTagWrapper *pTagWrapper)
|
|
{}
|
|
BOOL PreprocessFrame (CHTMLTagWrapper *pTagWrapper)
|
|
{}
|
|
BOOL PreprocessIframe (CHTMLTagWrapper *pTagWrapper)
|
|
{}
|
|
BOOL ProcessTag (CHTMLTagWrapper *pTagWrapper)
|
|
{
|
|
//
|
|
ATLASSERT (pTagWrapper);
|
|
|
|
CLiteHTMLTag *pLiteTag = pTagWrapper->Tag();
|
|
|
|
// Preprocess specific tags (Step 1)
|
|
CString sTagName = pLiteTag->getTagName();
|
|
if (_T("img") == sTagName)
|
|
PreprocessImg (pTagWrapper);
|
|
else if (_T("a") == sTagName)
|
|
PreprocessA (pTagWrapper);
|
|
else if (_T("frame") == sTagName)
|
|
PreprocessFrame (pTagWrapper);
|
|
else if (_T("iframe") == sTagName)
|
|
PreprocessIframe (pTagWrapper);
|
|
|
|
//
|
|
|
|
}
|
|
protected:
|
|
|
|
BOOL WriteMultiByte (const BYTE *aBuffer, DWORD aSize, UINT aCodePage)
|
|
{
|
|
if (NULL == m_hFile)
|
|
return FALSE;
|
|
|
|
// convert multibyte to widechar
|
|
if (CP_UTF8 == aCodePage)
|
|
return WriteUTF8 ((const char*) aBuffer, aSize);
|
|
|
|
const int nWideCharSize = MultiByteToWideChar(aCodePage, // code page
|
|
0, // character-type options
|
|
(LPCSTR) aBuffer, // string to map
|
|
aSize, // number of bytes in string
|
|
NULL, // wide-character buffer
|
|
0 // size of buffer
|
|
);
|
|
|
|
if (0 == nWideCharSize ||
|
|
ERROR_NO_UNICODE_TRANSLATION == nWideCharSize)
|
|
{
|
|
ATLTRACE2 ("CXHTMLWriter::WriteMultiByte() failed! (MultiByteToWideChar 0x%x)", GetLastError());
|
|
return FALSE;
|
|
}
|
|
wchar_t *aWideChars = new wchar_t [nWideCharSize];
|
|
if (NULL == aWideChars)
|
|
{
|
|
ATLTRACE2 ("CXHTMLWriter::WriteMultiByte() failed! (no memory 1)");
|
|
return FALSE;
|
|
}
|
|
|
|
const int nMultiByteToWideCharResult = MultiByteToWideChar(aCodePage, // code page
|
|
0, // character-type options
|
|
(LPCSTR) aBuffer, // string to map
|
|
aSize, // number of bytes in string
|
|
aWideChars, // wide-character buffer
|
|
nWideCharSize// size of buffer
|
|
);
|
|
if (0 == nWideCharSize ||
|
|
ERROR_NO_UNICODE_TRANSLATION == nWideCharSize)
|
|
{
|
|
ATLTRACE2 ("CXHTMLWriter::WriteMultiByte() failed! (0x%x)", GetLastError());
|
|
delete [] aWideChars;
|
|
return FALSE;
|
|
}
|
|
|
|
// convert to UTF-8
|
|
const int nUTF8Size = WideCharToMultiByte (CP_UTF8, 0, aWideChars, nWideCharSize, NULL, 0, NULL, NULL);
|
|
if (0 == nUTF8Size || ERROR_INSUFFICIENT_BUFFER == nUTF8Size)
|
|
{
|
|
ATLTRACE2 ("CXHTMLWriter::WriteMultiByte() failed! (WideCharToMultiByte)");
|
|
delete [] aWideChars;
|
|
return FALSE;
|
|
}
|
|
BYTE* aUTF8String = new BYTE [nUTF8Size+1];
|
|
if (NULL == aUTF8String)
|
|
{
|
|
ATLTRACE2 ("CXHTMLWriter::WriteMultiByte() failed! (no memory 2)");
|
|
delete [] aWideChars;
|
|
return FALSE;
|
|
}
|
|
aUTF8String[nUTF8Size] = 0x00;
|
|
const int nWideCharToUTF8Result = WideCharToMultiByte (CP_UTF8, 0, aWideChars, nWideCharSize, (LPSTR) aUTF8String, nUTF8Size, NULL, NULL);
|
|
delete [] aWideChars;
|
|
if (0 == nWideCharToUTF8Result || ERROR_INSUFFICIENT_BUFFER == nWideCharToUTF8Result)
|
|
{
|
|
ATLTRACE2 ("CXHTMLWriter::WriteMultiByte() failed! (WideCharToMultiByte)");
|
|
|
|
delete [] aUTF8String;
|
|
return FALSE;
|
|
}
|
|
|
|
BOOL bRes = WriteUTF8 ((char *) aUTF8String, nUTF8Size);
|
|
delete [] aUTF8String;
|
|
return bRes;
|
|
}
|
|
BOOL WriteUTF8 (const char *aBuffer, DWORD aSize)
|
|
{
|
|
if (NULL == m_hFile)
|
|
return FALSE;
|
|
|
|
DWORD dwWritten (0);
|
|
const BOOL bRes = ::WriteFile (m_hFile, aBuffer, aSize, &dwWritten, NULL);
|
|
if (0 == bRes)
|
|
{
|
|
ATLTRACE2 ("CXHTMLWriter::WriteUTF8() failed! (0x%x)", GetLastError());
|
|
}
|
|
return bRes;
|
|
}
|
|
BOOL WriteUnicode (CStringW aBuffer)
|
|
{
|
|
if (NULL == m_hFile)
|
|
return FALSE;
|
|
|
|
// convert to UTF-8
|
|
LPCWSTR pBuffer = aBuffer.GetBuffer();
|
|
const int nUTF8Size = WideCharToMultiByte (CP_UTF8, 0, pBuffer, -1, NULL, 0, NULL, NULL);
|
|
if (0 == nUTF8Size || ERROR_INSUFFICIENT_BUFFER == nUTF8Size)
|
|
{
|
|
ATLTRACE2 ("CXHTMLWriter::WriteUnicode() failed! (WideCharToMultiByte)");
|
|
aBuffer.ReleaseBuffer();
|
|
return FALSE;
|
|
}
|
|
BYTE* aUTF8String = new BYTE [nUTF8Size+1];
|
|
if (NULL == aUTF8String)
|
|
{
|
|
ATLTRACE2 ("CXHTMLWriter::WriteUnicode() failed! (no memory 2)");
|
|
aBuffer.ReleaseBuffer();
|
|
return FALSE;
|
|
}
|
|
aUTF8String[nUTF8Size] = 0x00;
|
|
const int nWideCharToUTF8Result = WideCharToMultiByte (CP_UTF8, 0, pBuffer, -1, (LPSTR) aUTF8String, nUTF8Size, NULL, NULL);
|
|
if (0 == nWideCharToUTF8Result || ERROR_INSUFFICIENT_BUFFER == nWideCharToUTF8Result)
|
|
{
|
|
ATLTRACE2 ("CXHTMLWriter::WriteUnicode() failed! (WideCharToMultiByte)");
|
|
delete [] aUTF8String;
|
|
aBuffer.ReleaseBuffer();
|
|
return FALSE;
|
|
}
|
|
aBuffer.ReleaseBuffer();
|
|
|
|
BOOL bRes = WriteUTF8 ((char *) aUTF8String, nUTF8Size);
|
|
delete [] aUTF8String;
|
|
return bRes;
|
|
}
|
|
BOOL WriteString (CString aBuffer)
|
|
{
|
|
if (NULL == m_hFile)
|
|
return FALSE;
|
|
|
|
BOOL bRes = TRUE;
|
|
|
|
#ifdef UNICODE
|
|
bRes = WriteUnicode (aBuffer);
|
|
#else
|
|
bRes = WriteMultiByte (aBuffer);
|
|
#endif
|
|
|
|
if (FALSE == bRes)
|
|
{
|
|
ATLTRACE2 ("CXHTMLWriter::WriteString() failed! (0x%x)", GetLastError());
|
|
}
|
|
return bRes;
|
|
}
|
|
|
|
virtual void CloseOutput ()
|
|
{
|
|
if (NULL != m_hFile)
|
|
{
|
|
CloseHandle (m_hFile);
|
|
m_hFile = NULL;
|
|
}
|
|
}
|
|
|
|
|
|
BOOL CloseTagPrefix (CHTMLTagWrapper *pPrevTagWrapper, bool aCloseTag = false)
|
|
{
|
|
DWORD dwState = pPrevTagWrapper->State ();
|
|
|
|
BOOL bRes = TRUE;
|
|
|
|
// a tag is already closed
|
|
if (TAG_STATE_CLOSED & dwState)
|
|
return TRUE;
|
|
|
|
// a tag has child tags - we must only close a prefix
|
|
if (TAG_STATE_PREFIX_CLOSED & dwState)
|
|
{
|
|
}
|
|
else
|
|
{
|
|
if ((TAG_STATE_HAS_CHILDTAG & dwState) || (TAG_STATE_HAS_TEXTS & dwState))
|
|
{
|
|
// close prefix (">")
|
|
dwState |= TAG_STATE_PREFIX_CLOSED;
|
|
pPrevTagWrapper->State (dwState);
|
|
bRes &= WriteString (pcTagEnd);
|
|
}
|
|
}
|
|
|
|
// we need to close tag
|
|
if (aCloseTag)
|
|
{
|
|
if ((TAG_STATE_HAS_CHILDTAG & dwState) || (TAG_STATE_HAS_TEXTS & dwState))
|
|
{
|
|
// write close tag
|
|
CString sTagPostFix = _T("</");
|
|
sTagPostFix += pPrevTagWrapper->Tag()->getTagName();
|
|
sTagPostFix += pcTagEnd;
|
|
|
|
bRes &= WriteString (sTagPostFix);
|
|
dwState |= TAG_STATE_CLOSED;
|
|
}
|
|
else
|
|
{
|
|
// inline close
|
|
CString sTagPostFix = _T("/>");
|
|
bRes &= WriteString (sTagPostFix);
|
|
dwState |= TAG_STATE_CLOSED;
|
|
}
|
|
}
|
|
|
|
|
|
//
|
|
return TRUE;
|
|
}
|
|
|
|
// IXHTMLWriter impl
|
|
virtual DWORD ErrorCode ()
|
|
{
|
|
return m_dwErrorCode;
|
|
}
|
|
virtual void SetOutput (CString &aOut)
|
|
{
|
|
m_sOutput = aOut;
|
|
}
|
|
virtual BOOL BeginFile ()
|
|
{
|
|
// write strict xhtml
|
|
m_dwErrorCode = 0;
|
|
CloseOutput ();
|
|
m_hFile = ::CreateFile (m_sOutput, GENERIC_WRITE, FILE_SHARE_READ, NULL, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL);
|
|
if (INVALID_HANDLE_VALUE == m_hFile)
|
|
{
|
|
ATLTRACE2 (_T("Can't open file!! (last error: 0x%x)"), GetLastError());
|
|
m_hFile = NULL;
|
|
m_dwErrorCode = 1;
|
|
return FALSE;
|
|
}
|
|
|
|
// write header
|
|
BOOL bRes = TRUE;
|
|
bRes &= WriteString (pcXmlSubtag);
|
|
bRes &= WriteString (pcDoctypeSubtag);
|
|
bRes &= WriteString (pcHtmlTagStart);
|
|
|
|
return bRes;
|
|
}
|
|
virtual BOOL StartTag (CHTMLTagWrapper *pTagWrapper, CHTMLTagWrapper *pPrevTagWrapper = NULL)
|
|
{
|
|
if (NULL == pTagWrapper)
|
|
return FALSE;
|
|
|
|
BOOL bRes = TRUE;
|
|
|
|
// close last tag prefix
|
|
if (NULL != pPrevTagWrapper)
|
|
{
|
|
// close prefix in prev tag or close tag
|
|
DWORD dwState = pPrevTagWrapper->State ();
|
|
dwState |= TAG_STATE_HAS_CHILDTAG;
|
|
pPrevTagWrapper->State (dwState);
|
|
CloseTagPrefix (pPrevTagWrapper);
|
|
}
|
|
|
|
CLiteHTMLTag *pTag = pTagWrapper->Tag();
|
|
if (NULL == pTag)
|
|
{
|
|
return FALSE;
|
|
}
|
|
|
|
DWORD dwState = TAG_STATE_OPENED;
|
|
|
|
// write tag prefix
|
|
CString sTagPrefix = _T("<");
|
|
sTagPrefix += pTag->getTagName();
|
|
bRes &= WriteString (sTagPrefix);
|
|
|
|
// write attributes
|
|
const CLiteHTMLAttributes* pAttrs = pTag->getAttributes();
|
|
if (NULL != pAttrs)
|
|
{
|
|
const int iAttrCount = pAttrs->getCount();
|
|
if (iAttrCount > 0)
|
|
{
|
|
sTagPrefix = _T(" ");
|
|
bRes &= WriteString (sTagPrefix);
|
|
dwState |= TAG_STATE_HAS_ATTR;
|
|
}
|
|
for (int iAttr = 0; iAttr < iAttrCount; iAttr++)
|
|
{
|
|
// write attr
|
|
CLiteHTMLElemAttr& oAttr = pAttrs->getAttribute(iAttr);
|
|
CString& sAttrName = oAttr.getName();
|
|
CString& sAttrValue = oAttr.getValue ();
|
|
|
|
CString sAttrPair = sAttrName;
|
|
sAttrPair += _T("=\"");
|
|
sAttrPair += sAttrValue;
|
|
sAttrPair += _T("\" ");
|
|
bRes &= WriteString (sAttrPair);
|
|
|
|
//
|
|
}
|
|
}
|
|
pTagWrapper->State (dwState);
|
|
|
|
return bRes;
|
|
}
|
|
virtual BOOL EndTag (CHTMLTagWrapper *pTagWrapper)
|
|
{
|
|
if (NULL == pTagWrapper)
|
|
return FALSE;
|
|
|
|
BOOL bRes = TRUE;
|
|
|
|
/*
|
|
CLiteHTMLTag *pTag = pTagWrapper->Tag();
|
|
if (NULL == pTag)
|
|
{
|
|
return FALSE;
|
|
}
|
|
|
|
|
|
// write attributes
|
|
size_t nTextsCount = pTag->getTextsCount();
|
|
if (0 == nTextsCount && pTag->IsTagInline())
|
|
{
|
|
CString sTagPostFix = _T("/>\n");
|
|
bRes &= WriteUTF8 (sTagPostFix, _tcslen (sTagPostFix));
|
|
m_bLastTagClosed = TRUE;
|
|
return bRes;
|
|
}
|
|
|
|
// WriteTexts
|
|
for (int nText = 0; nText < nTextsCount; nText++)
|
|
{
|
|
CString sText = pTag->getTextAt (nText);
|
|
if (sText == _T("92619"))
|
|
{
|
|
ATLTRACE2 ("trololo\n");
|
|
}
|
|
|
|
bRes &= WriteUTF8 (sText, _tcslen (sText));
|
|
}
|
|
*/
|
|
|
|
// close tag
|
|
bRes &= CloseTagPrefix (pTagWrapper, true);
|
|
return bRes;
|
|
}
|
|
virtual BOOL EndFile ()
|
|
{
|
|
// TODO: check all tags closed
|
|
|
|
// close html tag
|
|
BOOL bRes = TRUE;
|
|
//bRes &= WriteUTF8 (pcHtmlTagEnd, _tcslen (pcHtmlTagEnd));
|
|
CloseOutput ();
|
|
return bRes;
|
|
}
|
|
virtual void Characters (CHTMLTagWrapper *pTagWrapper, CString aText)
|
|
{
|
|
if (NULL == pTagWrapper)
|
|
return;
|
|
|
|
|
|
DWORD dwState = pTagWrapper->State ();
|
|
dwState |= TAG_STATE_HAS_TEXTS;
|
|
pTagWrapper->State (dwState);
|
|
|
|
CloseTagPrefix (pTagWrapper);
|
|
|
|
// write text
|
|
if (!aText.IsEmpty())
|
|
WriteString (aText);
|
|
}
|
|
};
|
|
|
|
|
|
|
|
// test - convert html to xml
|
|
int _tmain (int argc, TCHAR *argv[ ], TCHAR *envp[ ])
|
|
{
|
|
if (argc < 3)
|
|
{
|
|
ATLTRACE2 ("needed parameters not found!");
|
|
return -1;
|
|
}
|
|
// remove "" from begin and end of string
|
|
CString sInputFileName = argv[1];
|
|
CString sOutputFileName = argv[2];
|
|
|
|
|
|
// Opens input file for reading
|
|
HANDLE hInputFile = ::CreateFile (sInputFileName, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
|
|
if (INVALID_HANDLE_VALUE == hInputFile)
|
|
{
|
|
ATLTRACE2 ("Can't open file!! (last error: 0x%x)", GetLastError());
|
|
return -2;
|
|
}
|
|
|
|
// Create converter object
|
|
CLiteHTMLConverter oConverter;
|
|
IXHTMLWriter *pWriter = new CXHTMLWriter ();
|
|
pWriter->SetOutput (sOutputFileName);
|
|
oConverter.ToXHTML (hInputFile, pWriter);
|
|
|
|
CloseHandle (hInputFile);
|
|
delete pWriter;
|
|
|
|
|
|
return 0;
|
|
} |