mirror of
https://github.com/ONLYOFFICE/core.git
synced 2026-04-07 13:55:33 +08:00
845 lines
22 KiB
C++
845 lines
22 KiB
C++
#include "stdafx.h"
|
||
#include "LiteHTMLReader.h"
|
||
#include <exception>
|
||
#include "LiteHTMLEntityResolver.h"
|
||
#include "UnicodeTextFile.h"
|
||
|
||
|
||
#pragma warning(push, 4)
|
||
|
||
// helper - delete specific tag from html string
|
||
|
||
// Removes every HTML comment ("<!-- ... -->") from sHtml, in place.
// A comment that is opened but never closed is removed through the end of the string.
void CLiteHTMLReader::DeleteComments (CString &sHtml)
{
    const int nOpenerLen = 4;   // length of "<!--"
    const int nCloserLen = 3;   // length of "-->"

    // After a deletion the text that followed the comment slides down to nPos,
    // so the next search resumes from the very same offset.
    for (int nPos = sHtml.Find (_T("<!--"), 0); -1 != nPos; nPos = sHtml.Find (_T("<!--"), nPos))
    {
        const int nCloser = sHtml.Find (_T("-->"), nPos + nOpenerLen);

        // closed comment: drop opener..closer inclusive; otherwise drop the tail of the document
        const int nCount = (-1 != nCloser)
            ? (nCloser - nPos + nCloserLen)
            : (sHtml.GetLength() - nPos);

        sHtml.Delete (nPos, nCount);
    }
}
|
||
// Removes every element named sTagName from sHtml, in place.
//
// For each occurrence of "<tagname" (the match is case-sensitive and is a plain
// prefix match, so callers pass each spelling they care about):
//   - if a matching "</tagname>" follows, everything from "<tagname" through
//     "</tagname>" inclusive is deleted;
//   - otherwise, if the opening tag is terminated by '>', the whole opening tag
//     including that '>' is deleted;
//   - otherwise only the "<tagname" prefix itself is deleted.
//
// @param sHtml    - document text, modified in place
// @param sTagName - tag name without angle brackets; an empty name is ignored
void CLiteHTMLReader::DeleteTags (CString &sHtml, CString sTagName)
{
    const int nNameLen = sTagName.GetLength();

    // An empty name would turn the prefix into a bare "<" and match every tag.
    if (0 == nNameLen)
        return;

    CString sOpenPrefix = _T("<");
    sOpenPrefix += sTagName;            // "<tagname"

    CString sCloseTag = _T("</");
    sCloseTag += sTagName;
    sCloseTag += _T(">");               // "</tagname>"

    const int nOpenPrefixLen = nNameLen + 1;        // "<" + name
    const int nCloseTagLen   = nNameLen + 2 + 1;    // "</" + name + ">"

    int nStart = 0;
    while (true)
    {
        nStart = sHtml.Find (sOpenPrefix, nStart);
        if (-1 == nStart)
            break;

        const int nClose = sHtml.Find (sCloseTag, nStart + nOpenPrefixLen);
        if (-1 != nClose)
        {
            // complete element: remove "<tagname ...> ... </tagname>"
            sHtml.Delete (nStart, nClose - nStart + nCloseTagLen);
            continue;
        }

        // The element is never closed; remove just its opening tag.
        const int nOpenEnd = sHtml.Find (_T(">"), nStart + nOpenPrefixLen);
        if (-1 != nOpenEnd)
        {
            // +1 so that the terminating '>' goes away together with the tag
            // instead of being left behind as stray character data.
            sHtml.Delete (nStart, nOpenEnd - nStart + 1);
        }
        else
        {
            // truncated document: only the "<tagname" prefix is present
            sHtml.Delete (nStart, nOpenPrefixLen);
        }
    }
}
|
||
|
||
// Parses the buffer referenced by m_lpszBuffer / m_dwBufLen and reports what it
// finds to m_pEventHandler (BeginParse, StartTag, EndTag, Comment, Characters,
// EndParse), each one only when the matching notification is enabled through
// getEventNotify(). Any handler may set its bAbort argument to stop parsing;
// EndParse is still raised in that case.
//
// While inside a non-inline <script>, <style> or <textarea> element the tag
// parser is switched to a dedicated mode until the matching closing tag is seen.
//
// On the normal exit path the buffer members are reset (m_lpszBuffer = NULL,
// m_dwBufLen = 0).
//
// @return the value of m_dwBufPos when parsing stopped, or 0 if there is no buffer
UINT CLiteHTMLReader::parseDocument(void)
{
    ATLASSERT(m_lpszBuffer != NULL);

    bool    bAbort = false;         // continue parsing or abort?
    bool    bIsClosingTag = false;  // tag parsed is a closing tag?
    bool    bIsOpeningTag = false;  // tag parsed is an opening tag?

    CString strCharacters;          // character data
    CString strComment;             // comment data
    DWORD   dwCharDataStart = 0L;   // starting position of character data
    DWORD   dwCharDataLen = 0L;     // length of character data
    LONG    lTemp = 0L;             // temporary storage
    TCHAR   ch = 0;                 // character at current buffer position
    CLiteHTMLTag oTag;              // tag information

    // Declared ahead of every "goto LEndParse" so that no jump bypasses
    // the initialisation of this variable.
    TTagParsingMode aTagParsingMode = TPM_NORMAL;

    if ( (!m_lpszBuffer) || (!m_dwBufLen) )
        return (0U);

    // reset seek pointer to beginning
    ResetSeekPointer();

    // notify event handler about parsing startup
    if (getEventNotify(notifyStartStop))
    {
        bAbort = false;
        m_pEventHandler->BeginParse(m_dwAppData, bAbort);
        if (bAbort) goto LEndParse;
    }

    // skip leading white-space characters
    while (isWhiteSpace(ReadChar()))
        ;

    // the loop above consumed one non-white-space character; put it back
    ch = UngetChar();
    while ((ch = ReadChar()) != _T('\0'))
    {
        switch (ch)
        {

        // tag starting delimiter?
        case _T('<'):
            {
                UngetChar();

                strComment.Empty();
                if (!parseComment(strComment))
                {
                    bIsOpeningTag = false;
                    bIsClosingTag = false;

                    if (!parseTag(oTag, bIsOpeningTag, bIsClosingTag, aTagParsingMode))
                    {
                        // not a tag after all: treat '<' as ordinary character data
                        ++dwCharDataLen;

                        // manually advance buffer position
                        // because the last call to UngetChar()
                        // moved it back one character
                        ch = ReadChar();

                        break;
                    }
                    else
                    {
                        // an opening <script>/<style>/<textarea> that is not
                        // inline switches the tag parser into a dedicated mode
                        if (bIsOpeningTag && (TPM_NORMAL == aTagParsingMode))
                        {
                            if (0 == oTag.getTagName().CompareNoCase(_T("script")))
                            {
                                if (!oTag.IsTagInline())
                                {
                                    aTagParsingMode = TPM_SCRIPT;
                                }
                            }
                            else if (0 == oTag.getTagName().CompareNoCase(_T("style")))
                            {
                                if (!oTag.IsTagInline())
                                {
                                    aTagParsingMode = TPM_STYLE;
                                }
                            }
                            else if (0 == oTag.getTagName().CompareNoCase(_T("textarea")))
                            {
                                if (!oTag.IsTagInline())
                                {
                                    aTagParsingMode = TPM_TEXTAREA;
                                }
                            }
                        }

                        // the matching closing tag switches back to normal mode
                        if (bIsClosingTag && (TPM_NORMAL != aTagParsingMode))
                        {
                            if ((TPM_SCRIPT == aTagParsingMode) && (0 == oTag.getTagName().CompareNoCase(_T("script"))))
                            {
                                aTagParsingMode = TPM_NORMAL;
                            }
                            else if ((TPM_STYLE == aTagParsingMode) && (0 == oTag.getTagName().CompareNoCase(_T("style"))))
                            {
                                aTagParsingMode = TPM_NORMAL;
                            }
                            else if ((TPM_TEXTAREA == aTagParsingMode) && (0 == oTag.getTagName().CompareNoCase(_T("textarea"))))
                            {
                                aTagParsingMode = TPM_NORMAL;
                            }
                        }
                    }
                }

                // clear pending notifications
                if ( (dwCharDataLen) || (strCharacters.GetLength()) )
                {
                    strCharacters += CString(&m_lpszBuffer[dwCharDataStart], dwCharDataLen);
                    NormalizeCharacters(strCharacters);

                    if ( (strCharacters.GetLength()) &&
                         (getEventNotify(notifyCharacters)) )
                    {
                        bAbort = false;
                        m_pEventHandler->Characters(strCharacters, m_dwAppData, bAbort);
                        if (bAbort) goto LEndParse;
                    }

                    strCharacters.Empty();
                }

                // character data (if any) restarts right after the tag/comment
                dwCharDataLen = 0L;
                dwCharDataStart = m_dwBufPos;

                if (strComment.GetLength())
                {
                    if (getEventNotify(notifyComment))
                    {
                        bAbort = false;
                        m_pEventHandler->Comment(strComment, m_dwAppData, bAbort);
                        if (bAbort) goto LEndParse;
                    }
                }
                else
                {
                    if ( (bIsOpeningTag) && (getEventNotify(notifyTagStart)) )
                    {
                        bAbort = false;
                        m_pEventHandler->StartTag(&oTag, m_dwAppData, bAbort);
                        if (bAbort) goto LEndParse;
                    }

                    if ( (bIsClosingTag) && (getEventNotify(notifyTagEnd)) )
                    {
                        bAbort = false;
                        m_pEventHandler->EndTag(&oTag, m_dwAppData, bAbort);
                        if (bAbort) goto LEndParse;
                    }
                }

                break;
            }

        // entity reference beginning delimiter?
        case _T('&'):
            {
                UngetChar();

                lTemp = 0;
                if (m_bResolveEntities)
                    lTemp = CLiteHTMLEntityResolver::resolveEntity(&m_lpszBuffer[m_dwBufPos], ch);

                if (lTemp)
                {
                    // U+00AD SOFT HYPHEN, written as an escape so the literal
                    // does not depend on the encoding of this source file.
                    // A resolved soft hyphen is dropped from the output.
                    const wchar_t wcSoftHyphen = L'\x00AD';
                    if (ch == wcSoftHyphen)
                        strCharacters += CString(&m_lpszBuffer[dwCharDataStart], dwCharDataLen);
                    else
                        strCharacters += CString(&m_lpszBuffer[dwCharDataStart], dwCharDataLen) + ch;

                    // skip the entity text; lTemp is the value returned by resolveEntity
                    m_dwBufPos += lTemp;
                    dwCharDataStart = m_dwBufPos;
                    dwCharDataLen = 0L;
                }
                else
                {
                    // not resolved: keep '&' as ordinary character data
                    ch = ReadChar();
                    ++dwCharDataLen;
                }

                break;
            }

        // any other character
        default:
            {
                ++dwCharDataLen;
                break;
            }
        }
    }

    // clear pending notifications
    if ( (dwCharDataLen) || (strCharacters.GetLength()) )
    {
        // NOTE(review): the loop above only ends when ch is the terminator, so
        // "+ ch" appends a zero character here; kept as in the original - confirm
        // whether that is intended.
        strCharacters += CString(&m_lpszBuffer[dwCharDataStart], dwCharDataLen) + ch;
        NormalizeCharacters(strCharacters);
        strCharacters.TrimRight();  // explicit trailing white-space removal

        if ( (strCharacters.GetLength()) &&
             (getEventNotify(notifyCharacters)) )
        {
            bAbort = false;
            m_pEventHandler->Characters(strCharacters, m_dwAppData, bAbort);
            if (bAbort) goto LEndParse;
        }
    }

LEndParse:
    // notify event handler about parsing completion
    if (getEventNotify(notifyStartStop))
        m_pEventHandler->EndParse(m_dwAppData, bAbort);

    m_lpszBuffer = NULL;
    m_dwBufLen = 0L;
    return (m_dwBufPos);
}
|
||
|
||
/**
|
||
* CLiteHTMLReader::Read
|
||
* The Read method parses an HTML document from an
|
||
* in-memory string buffer and raises events defined
|
||
* in ILiteHTMLReaderEvents to notify about various
|
||
* elements.
|
||
*
|
||
* @param lpszString - string containing HTML text to parse
|
||
*/
|
||
// Parses the NUL-terminated string lpszString as an HTML document.
// Returns the result of parseDocument(), or 0 for an empty string.
UINT CLiteHTMLReader::Read(LPCTSTR lpszString)
{
    ATLASSERT(AtlIsValidString(lpszString));

    // buffer length is measured in TCHARs
    m_dwBufLen = ::_tcslen(lpszString);

    // nothing to parse
    if (!m_dwBufLen)
        return (0U);

    m_lpszBuffer = lpszString;
    return (parseDocument());
}
|
||
|
||
/**
|
||
* CLiteHTMLReader::ReadFile
|
||
* This method is similar to the Read(LPCTSTR) method,
|
||
* except that, it accepts a file HANDLE instead of
|
||
* an in-memory string buffer containing HTML text.
|
||
*
|
||
* @param hFile - file handle
|
||
*
|
||
*/
|
||
// Maps the whole file hFile into memory, treats its contents as an array of
// TCHARs and parses it with parseDocument().
//
// @param hFile - handle of a disk file
// @return the result of parseDocument(); 0 if the file holds less than one
//         TCHAR or if querying/mapping the file fails
UINT CLiteHTMLReader::ReadFile(HANDLE hFile)
{
    ATLASSERT(hFile != INVALID_HANDLE_VALUE);
    ATLASSERT(::GetFileType(hFile) == FILE_TYPE_DISK);

    // Initialised up front: the error paths below jump to the shared cleanup
    // code, which must never see indeterminate values.
    HANDLE  hFileMap = NULL;
    LPCTSTR lpsz = NULL;
    UINT    nRetVal = 0U;

    // determine file size
    m_dwBufLen = ::GetFileSize(hFile, NULL);
    if (m_dwBufLen == INVALID_FILE_SIZE)
    {
        ATLTRACE2("(Error) CLiteHTMLReader::Read:"
                  " GetFileSize() failed;"
                  " GetLastError() returns 0x%08x.\n", ::GetLastError());
        goto LError;
    }

    // calculate length, in TCHARs, of the buffer
    m_dwBufLen /= sizeof(TCHAR);
    if (!m_dwBufLen)
        return (0U);

    // create a file-mapping object for the file
    hFileMap = ::CreateFileMapping(hFile, NULL, PAGE_READONLY, 0L, 0L, NULL);
    if (hFileMap == NULL)
    {
        ATLTRACE2("(Error) CLiteHTMLReader::Read:"
                  " CreateFileMapping() failed;"
                  " GetLastError() returns 0x%08x.\n", ::GetLastError());
        goto LError;
    }

    // map the entire file into the address-space of the application
    lpsz = (LPCTSTR)::MapViewOfFile(hFileMap, FILE_MAP_READ, 0L, 0L, 0L);
    if (lpsz == NULL)
    {
        ATLTRACE2("(Error) CLiteHTMLReader::Read:"
                  " MapViewOfFile() failed;"
                  " GetLastError() returns 0x%08x.\n", ::GetLastError());
        goto LError;
    }

    m_lpszBuffer = lpsz;
    nRetVal = parseDocument();
    goto LCleanExit;

LError:
    nRetVal = 0U;
    m_dwBufLen = 0L;

LCleanExit:
    // release only what was actually acquired
    if (lpsz != NULL)
        ATLVERIFY(::UnmapViewOfFile(lpsz));
    if (hFileMap)
        ATLVERIFY(::CloseHandle(hFileMap));
    return (nRetVal);
}
|
||
|
||
// Maps a charset name (as found in a <meta> tag or an XML declaration) to the
// corresponding TBOMType value. The comparison ignores case.
// Returns TB_UNKNOWN for an empty or unrecognised name.
TBOMType CharsetStringToInt (CStringA sCharset)
{
    struct TCharsetName
    {
        const char *pszName;
        TBOMType    nType;
    };

    static const TCharsetName c_aKnownCharsets[] =
    {
        { "utf-8",        TB_UTF8        },
        { "windows-1250", TB_WINDOWS1250 },  // central europe, single byte
        { "windows-1251", TB_WINDOWS1251 },  // cyrillic
        { "windows-1252", TB_WINDOWS1252 },  // western european
        { "windows-1253", TB_WINDOWS1253 },  // greek
        { "windows-1254", TB_WINDOWS1254 },  // turkish
        { "windows-1255", TB_WINDOWS1255 },  // hebrew
        { "windows-1256", TB_WINDOWS1256 },  // arabic
        { "windows-1257", TB_WINDOWS1257 },  // baltic
        { "windows-1258", TB_WINDOWS1258 },  // vietnamese
        { "windows-874",  TB_WINDOWS874  },  // thai
        { "koi8-r",       TB_KOI8R       },  // russian koi8-r
        { "iso-8859-1",   TB_ISO8859_1   },  // iso-8859-1
        { "shift_jis",    TB_SHIFT_JIS   }   // japanese shift-jis
    };

    if (sCharset.IsEmpty())
        return TB_UNKNOWN;

    const size_t nKnown = sizeof (c_aKnownCharsets) / sizeof (c_aKnownCharsets[0]);
    for (size_t nIndex = 0; nIndex < nKnown; ++nIndex)
    {
        if (0 == sCharset.CompareNoCase (c_aKnownCharsets[nIndex].pszName))
            return c_aKnownCharsets[nIndex].nType;
    }

    return TB_UNKNOWN;
}
|
||
// codepage detect;
|
||
// Determines the encoding of the file hFile.
//
// Detection order:
//   1. byte-order mark, via GetBomType() (which also fills nBomSize);
//   2. a <meta> tag in the first 1 MB of the file, either
//        <meta http-equiv="content-type" content="...; charset=NAME">  or
//        <meta charset=NAME>
//      where NAME may be wrapped in double quotes, single quotes or nothing;
//   3. the encoding="NAME" / encoding='NAME' pseudo-attribute of a leading
//      "<?...>" declaration.
//
// @param hFile    - handle of the file to inspect
// @param nBomSize - set to 0 on entry and then passed to GetBomType()
// @return the detected type, TB_UNKNOWN if nothing was recognised, or
//         TB_ERROR if seeking or reading the file failed
TBOMType CLiteHTMLReader::DetectCodePage (HANDLE hFile, int& nBomSize)
{
    TBOMType nBomType (TB_UNKNOWN);

    nBomSize = 0;

    try
    {
        // seek to begin of file
        if (INVALID_SET_FILE_POINTER == ::SetFilePointer (hFile, 0, NULL, FILE_BEGIN))
        {
            ATLTRACE2("(Error) CLiteHTMLReader::DetectCodePage:"
                      " ::SetFilePointer failed;"
                      " GetLastError() returns 0x%08x.\n", ::GetLastError());
            throw (UINT) -1;
        }

        nBomType = GetBomType (hFile, nBomSize);
        if (TB_UNKNOWN != nBomType)
        {
            return nBomType;
        }

        // charset is not detected by bom. find it from meta tag
        CStringA sCharset;

        if (INVALID_SET_FILE_POINTER == ::SetFilePointer (hFile, 0, NULL, FILE_BEGIN))
        {
            ATLTRACE2("(Error) CLiteHTMLReader::DetectCodePage:"
                      " ::SetFilePointer failed;"
                      " GetLastError() returns 0x%08x.\n", ::GetLastError());
            throw (UINT) -1;
        }

        // determine buffer size: the whole file, but at most 1 MB
        const DWORD dwMaxProbeSize = 1 * 1024 * 1024;
        const DWORD dwFileSize = ::GetFileSize (hFile, NULL);
        const DWORD dwBufferSize = min (dwFileSize, dwMaxProbeSize);
        CStringA sBuffer;
        BYTE *pBuffer = reinterpret_cast <BYTE *> (sBuffer.GetBuffer(dwBufferSize));

        // load the head of the file
        DWORD dwRead (0);
        BOOL bReadRes = ::ReadFile (hFile, pBuffer, dwBufferSize, &dwRead, NULL);
        if ((0 == bReadRes) || (dwBufferSize != dwRead))
        {
            ATLTRACE2("(Error) CLiteHTMLReader::DetectCodePage:"
                      " ::ReadFile(Buffer) failed;"
                      " GetLastError() returns 0x%08x.\n", ::GetLastError());
            throw (UINT) -1;
        }
        sBuffer.ReleaseBuffer (dwBufferSize);
        sBuffer.MakeLower ();   // all searches below are done on lower-case text

        // seek to the beginning of file
        if (INVALID_SET_FILE_POINTER == ::SetFilePointer (hFile, 0, NULL, FILE_BEGIN))
        {
            ATLTRACE2("(Error) CLiteHTMLReader::DetectCodePage:"
                      " ::SetFilePointer failed;"
                      " GetLastError() returns 0x%08x.\n", ::GetLastError());
            throw (UINT) -1;
        }

        // find <meta> tag from beginning of file (usually it is near the top of the document)
        int nMetaStart = sBuffer.Find ("<meta");

        while (-1 != nMetaStart)
        {
            nMetaStart += 5;    // strlen ("<meta")

            // find closing '>'
            int nMetaEnd = sBuffer.Find (">", nMetaStart);
            if (-1 == nMetaEnd)
            {
                // meta is not closed
                break;
            }
            nMetaEnd++;         // strlen (">")

            // attributes of this meta tag, including the closing '>'
            CStringA sMetaContent = sBuffer.Mid (nMetaStart, nMetaEnd - nMetaStart);
            sMetaContent.MakeLower ();

            const int nHttpEquiv = sMetaContent.Find ("http-equiv=");
            const int nContent   = sMetaContent.Find ("content=");

            // offset of the first character after "charset=", or -1
            int nCharsetValue = -1;

            if (-1 != nHttpEquiv && -1 != nContent)
            {
                // <meta http-equiv="content-type" content="...; charset=NAME">
                const bool bHttpEquivContentType = (-1 != sMetaContent.Find ("content-type", nHttpEquiv + 11));
                if (bHttpEquivContentType)
                {
                    const int nCharset = sMetaContent.Find ("charset=", nContent + 8);
                    if (-1 != nCharset)
                        nCharsetValue = nCharset + 8;
                }
            }
            else if (-1 == nHttpEquiv && -1 == nContent)
            {
                // short form: <meta charset=NAME>
                const int nCharset = sMetaContent.Find ("charset=");
                if (-1 != nCharset)
                    nCharsetValue = nCharset + 8;
            }

            if (-1 != nCharsetValue)
            {
                // Drop an optional opening quote, then take everything up to the
                // first delimiter. sMetaContent always ends with '>', so the value
                // is terminated even when it is not quoted.
                CStringA sValue = sMetaContent.Mid (nCharsetValue);
                sValue.TrimLeft ("\"' \t");
                sCharset = sValue.SpanExcluding ("\"';> \t\r\n/");
            }

            if (!sCharset.IsEmpty())
            {
                nBomType = CharsetStringToInt (sCharset);
                break;
            }

            // find next meta tag
            nMetaStart = sBuffer.Find ("<meta", nMetaEnd);
        }

        if (TB_UNKNOWN == nBomType)
        {
            // fall back to a declaration like: <?xml version='1.0' encoding="UTF-8"?>
            const int nXmlStart = sBuffer.Find ("<?");
            if (-1 != nXmlStart)
            {
                const int nXmlEnd = sBuffer.Find ('>', nXmlStart + 2);
                if (-1 != nXmlEnd)
                {
                    // string like 'xml version='1.0' encoding="UTF-8"?'
                    const CStringA sXmlNodeString = sBuffer.Mid (nXmlStart + 2, nXmlEnd - nXmlStart - 2);
                    const int nEncoding = sXmlNodeString.Find ("encoding");
                    if (-1 != nEncoding)
                    {
                        // text after the word "encoding" (8 characters)
                        CStringA sRest = sXmlNodeString.Mid (nEncoding + 8);

                        // the value is delimited by a pair of identical quotes, " or '
                        const int nQuoteOpen = sRest.FindOneOf ("\"'");
                        if (-1 != nQuoteOpen)
                        {
                            const char chQuote = sRest[nQuoteOpen];
                            sRest = sRest.Mid (nQuoteOpen + 1);

                            const int nQuoteClose = sRest.Find (chQuote);
                            if (-1 != nQuoteClose)
                            {
                                nBomType = CharsetStringToInt (sRest.Left (nQuoteClose));
                            }
                        }
                    }
                }
            }
        }
    }
    catch (UINT)
    {
        // raised above when seeking or reading the file failed
        nBomType = TB_ERROR;
    }

    return nBomType;
}
|
||
|
||
//
|
||
// Reads the file hFile, converts its contents to a CString using the encoding
// reported by DetectCodePage(), removes <script> elements and comments, and
// parses the result with parseDocument().
//
// @param hFile     - handle of a disk file
// @param aCodePage - code page used when DetectCodePage() returns TB_UNKNOWN
// @return the result of parseDocument(); 0 for an empty file or on any failure
UINT CLiteHTMLReader::ReadFile2 (HANDLE hFile, UINT aCodePage)
{
    ATLASSERT(hFile != INVALID_HANDLE_VALUE);
    ATLASSERT(::GetFileType(hFile) == FILE_TYPE_DISK);

    int nBomSize (0);
    TBOMType nCodePage = DetectCodePage (hFile, nBomSize);

    HANDLE  hFileMap (NULL);
    LPCVOID pViewBase (NULL);   // address returned by MapViewOfFile; needed to unmap
    UINT    nRetVal (0);
    CString sHtml;

    try
    {
        // determine file size
        m_dwBufLen = ::GetFileSize(hFile, NULL);
        if (m_dwBufLen == INVALID_FILE_SIZE)
        {
            ATLTRACE2("(Error) CLiteHTMLReader::Read:"
                      " GetFileSize() failed;"
                      " GetLastError() returns 0x%08x.\n", ::GetLastError());
            // A bare "throw;" has no exception to rethrow here and would end in
            // std::terminate(); throw a value for the catch (...) below instead.
            throw (UINT) -1;
        }

        if (0 == m_dwBufLen)
            return (0U);

        // create a file-mapping object for the file
        hFileMap = ::CreateFileMapping(hFile, NULL, PAGE_READONLY, 0L, 0L, NULL);
        if (hFileMap == NULL)
        {
            ATLTRACE2("(Error) CLiteHTMLReader::Read:"
                      " CreateFileMapping() failed;"
                      " GetLastError() returns 0x%08x.\n", ::GetLastError());
            throw (UINT) -1;
        }

        // map the entire file into the address-space of the application
        pViewBase = ::MapViewOfFile(hFileMap, FILE_MAP_READ, 0L, 0L, 0L);
        if (pViewBase == NULL)
        {
            ATLTRACE2("(Error) CLiteHTMLReader::Read:"
                      " MapViewOfFile() failed;"
                      " GetLastError() returns 0x%08x.\n", ::GetLastError());
            throw (UINT) -1;
        }

        // skip BOM (the view itself is still released through pViewBase)
        LPCSTR lpsz = static_cast <LPCSTR> (pViewBase) + nBomSize;
        m_dwBufLen -= nBomSize;

        // translate the detected type into a Windows code page where one exists
        bool bSimpleConvertToUnicode = true;
        switch (nCodePage)
        {
        case TB_UNKNOWN:
            // keep the default code page passed through the function parameter
            break;
        case TB_UTF7:
            aCodePage = CP_UTF7;
            break;
        case TB_UTF8:
            aCodePage = CP_UTF8;
            break;
        case TB_WINDOWS1250:
            aCodePage = 1250;
            break;
        case TB_WINDOWS1251:
            aCodePage = 1251;
            break;
        case TB_WINDOWS1252:
        case TB_ISO8859_1:      // html5 says that
            aCodePage = 1252;
            break;
        case TB_WINDOWS1253:
            aCodePage = 1253;
            break;
        case TB_WINDOWS1254:
            aCodePage = 1254;
            break;
        case TB_WINDOWS1255:
            aCodePage = 1255;
            break;
        case TB_WINDOWS1256:
            aCodePage = 1256;
            break;
        case TB_WINDOWS1257:
            aCodePage = 1257;
            break;
        case TB_WINDOWS1258:
            aCodePage = 1258;
            break;
        case TB_WINDOWS874:
            aCodePage = 874;
            break;
        case TB_SHIFT_JIS:
            aCodePage = 932;
            break;
        default:
            // need complex conversion
            bSimpleConvertToUnicode = false;
            break;
        }

        if (bSimpleConvertToUnicode)
        {
            BOOL bRes = ConvertMultibyteToUnicode (aCodePage, lpsz, m_dwBufLen, sHtml);
            if (!bRes)
                throw (UINT) -1;
        }
        else
        {
            if (TB_KOI8R == nCodePage)
            {
                BOOL bRes = ConvertKoi8RToUnicode (lpsz, m_dwBufLen, sHtml);
                if (!bRes)
                    throw (UINT) -1;
            }
            else if (TB_UTF16LE == nCodePage)
            {
                const int nSizeInChars = m_dwBufLen / sizeof (WCHAR);
                WCHAR *pwcBuffer = sHtml.GetBufferSetLength (nSizeInChars);
                if (NULL != pwcBuffer)
                {
                    ::memcpy (pwcBuffer, lpsz, nSizeInChars * sizeof (WCHAR));  // even quantity of bytes
                    sHtml.ReleaseBuffer ();
                }
            }
            else if (TB_UTF16BE == nCodePage)
            {
                const int nSizeInChars = m_dwBufLen / sizeof (WCHAR);
                WCHAR *pwcBuffer = sHtml.GetBufferSetLength (nSizeInChars);
                if (NULL != pwcBuffer)
                {
                    // Read the input as unsigned bytes: combining plain (possibly
                    // signed) chars would sign-extend every byte >= 0x80 and
                    // corrupt the resulting code unit.
                    const BYTE *pBytes = reinterpret_cast <const BYTE *> (lpsz);
                    for (int nChar = 0; nChar < nSizeInChars; nChar ++)
                    {
                        const BYTE nHigh = pBytes[nChar << 1];
                        const BYTE nLow  = pBytes[(nChar << 1) + 1];
                        pwcBuffer[nChar] = static_cast <WCHAR> ((nHigh << 8) | nLow);
                    }
                    sHtml.ReleaseBuffer ();
                }
            }
        }

        // delete script elements and comments before parsing
        DeleteTags (sHtml, _T("script"));
        DeleteTags (sHtml, _T("SCRIPT"));
        DeleteComments (sHtml);

        m_lpszBuffer = sHtml.GetBuffer();
        nRetVal = parseDocument ();
        sHtml.ReleaseBuffer();
    }
    catch (...)
    {
        nRetVal = 0U;
        m_dwBufLen = 0L;
    }

    // release the view through the exact address MapViewOfFile returned
    if (pViewBase != NULL)
        ATLVERIFY(::UnmapViewOfFile(pViewBase));
    if (hFileMap)
        ATLVERIFY(::CloseHandle(hFileMap));
    return (nRetVal);
}
|
||
|
||
// static
|
||
// static
// Converts aInputSize bytes of KOI8-R text at aInput into UTF-16 and stores
// the result in aOut. Bytes below 0x80 are copied unchanged; bytes 0x80..0xFF
// are translated through the table below.
// Returns FALSE if the output buffer could not be obtained, TRUE otherwise.
BOOL CLiteHTMLReader::ConvertKoi8RToUnicode (const LPCSTR aInput, const int aInputSize, CStringW &aOut) throw()
{
    ATLASSERT (aInput);
    ATLASSERT (aInputSize);

    // rfc-1489: code points for the upper half (0x80..0xFF) of KOI8-R
    static const USHORT koi82unicode[128] = {
        0x2500,0x2502,0x250c,0x2510,0x2514,0x2518,0x251c,0x2524,
        0x252c,0x2534,0x253c,0x2580,0x2584,0x2588,0x258c,0x2590,
        0x2591,0x2592,0x2593,0x2320,0x25a0,0x2219,0x221a,0x2248,
        0x2264,0x2265,0x00a0,0x2321,0x00b0,0x00b2,0x00b7,0x00f7,
        0x2550,0x2551,0x2552,0x0451,0x2553,0x2554,0x2555,0x2556,
        0x2557,0x2558,0x2559,0x255a,0x255b,0x255c,0x255d,0x255e,
        0x255f,0x2560,0x2561,0x0401,0x2562,0x2563,0x2564,0x2565,
        0x2566,0x2567,0x2568,0x2569,0x256a,0x256b,0x256c,0x00a9,
        0x044e,0x0430,0x0431,0x0446,0x0434,0x0435,0x0444,0x0433,
        0x0445,0x0438,0x0439,0x043a,0x043b,0x043c,0x043d,0x043e,
        0x043f,0x044f,0x0440,0x0441,0x0442,0x0443,0x0436,0x0432,
        0x044c,0x044b,0x0437,0x0448,0x044d,0x0449,0x0447,0x044a,
        0x042e,0x0410,0x0411,0x0426,0x0414,0x0415,0x0424,0x0413,
        0x0425,0x0418,0x0419,0x041a,0x041b,0x041c,0x041d,0x041e,
        0x041f,0x042f,0x0420,0x0421,0x0422,0x0423,0x0416,0x0412,
        0x042c,0x042b,0x0417,0x0428,0x042d,0x0429,0x0427,0x042a
    };

    // one output character per input byte
    WCHAR *pwcTarget = aOut.GetBufferSetLength (aInputSize);
    if (NULL == pwcTarget)
        return FALSE;

    const BYTE *pSource = reinterpret_cast <const BYTE *> (aInput);
    for (int nIndex = 0; nIndex < aInputSize; ++nIndex)
    {
        const BYTE nByte = pSource[nIndex];
        if (nByte & 0x80)
            pwcTarget[nIndex] = (WCHAR) koi82unicode [nByte & 0x7F];   // upper half: table lookup
        else
            pwcTarget[nIndex] = (WCHAR) nByte;                         // lower half: copied as is
    }
    aOut.ReleaseBuffer ();

    return TRUE;
}
|
||
/*
|
||
// static
|
||
BOOL CLiteHTMLReader::ConvertMultibyteToUnicode (const UINT aCodePage, const LPCSTR aInput, const int aInputSize, CStringW &aOut) throw()
|
||
{
|
||
ATLASSERT (aInput);
|
||
ATLASSERT (aInputSize);
|
||
|
||
const int nUnicodeSize = MultiByteToWideChar (aCodePage, 0, aInput, aInputSize, NULL, 0);
|
||
|
||
WCHAR *pwStr = NULL;
|
||
if (0 != nUnicodeSize)
|
||
{
|
||
try
|
||
{
|
||
pwStr = new WCHAR [nUnicodeSize];
|
||
}
|
||
catch (std::bad_alloc& ba)
|
||
{
|
||
ATLTRACE2("(Error) CLiteHTMLReader::ConvertMultibyteToUnicode:"
|
||
"new[] failed;"
|
||
" exception: %s\n", ba.what ());
|
||
}
|
||
if (NULL == pwStr)
|
||
{
|
||
ATLTRACE2("(Error) CLiteHTMLReader::ConvertMultibyteToUnicode:"
|
||
" new failed;"
|
||
" GetLastError() returns 0x%08x.\n", ::GetLastError());
|
||
|
||
return FALSE;
|
||
}
|
||
|
||
const int nStrSize = ::MultiByteToWideChar (aCodePage, 0, aInput, aInputSize, pwStr, nUnicodeSize);
|
||
ATLTRACE2 ("CLiteHTMLReader::ConvertMultibyteToUnicode(): %i bytes has been converted to unicode\n", nStrSize);
|
||
}
|
||
|
||
// Preprocess
|
||
|
||
if (NULL != pwStr)
|
||
{
|
||
aOut = pwStr;
|
||
delete [] pwStr;
|
||
pwStr = NULL;
|
||
return TRUE;
|
||
}
|
||
return FALSE;
|
||
}
|
||
*/
|
||
#pragma warning(pop)
|
||
|