Files
core/ASCOfficeHtmlFile_SaxHtmlParser/HTMLReaderLib/LiteHTMLReader.cpp

845 lines
22 KiB
C++
Raw Blame History

#include "stdafx.h"
#include "LiteHTMLReader.h"
#include <exception>
#include "LiteHTMLEntityResolver.h"
#include "UnicodeTextFile.h"
#pragma warning(push, 4)
// helper - strip every "<!-- ... -->" comment from the html string (in place).
// A comment that is never closed is removed up to the end of the string.
void CLiteHTMLReader::DeleteComments (CString &sHtml)
{
    const int nOpenLen  = 4;    // length of "<!--"
    const int nCloseLen = 3;    // length of "-->"

    // Each pass removes the comment found at nOpen, so the next search
    // resumes from the same offset.
    for (int nOpen = sHtml.Find (_T("<!--"), 0);
         -1 != nOpen;
         nOpen = sHtml.Find (_T("<!--"), nOpen))
    {
        const int nClose = sHtml.Find (_T("-->"), nOpen + nOpenLen);

        const int nRemove = (-1 == nClose)
            ? (sHtml.GetLength() - nOpen)       // unterminated: drop the rest of the text
            : (nClose - nOpen + nCloseLen);     // drop the comment including "-->"

        sHtml.Delete (nOpen, nRemove);
    }
}
// helper - delete every occurrence of a specific element from the html string.
//
// Everything from "<tagname" up to and including the matching "</tagname>"
// is removed. The match is case-sensitive and is a plain prefix match on
// "<tagname". If no closing tag follows, only the opening tag (through its
// '>') is removed; if even the '>' is missing, just "<tagname" is removed.
//
// @param sHtml    - document text, modified in place
// @param sTagName - element name without angle brackets (e.g. "script")
void CLiteHTMLReader::DeleteTags (CString &sHtml, CString sTagName)
{
    // An empty name would make the prefix a bare "<" and match every tag.
    if (sTagName.IsEmpty())
        return;

    const int nTagLength = sTagName.GetLength();

    CString sOpenPrefix = _T("<");
    sOpenPrefix += sTagName;                            // "<tagname"

    CString sCloseTag = _T("</");
    sCloseTag += sTagName;
    sCloseTag += _T(">");                               // "</tagname>"
    const int nCloseLength = sCloseTag.GetLength();     // "</" + tagname + ">"

    int iStart = 0;
    while (true)
    {
        iStart = sHtml.Find (sOpenPrefix, iStart);
        if (-1 == iStart)
            break;

        const int iAfterPrefix = iStart + nTagLength + 1;   // just past "<tagname"
        const int iClose = sHtml.Find (sCloseTag, iAfterPrefix);
        if (-1 != iClose)
        {
            // remove the opening tag, the content and the closing tag
            sHtml.Delete (iStart, iClose - iStart + nCloseLength);
            continue;
        }

        // no closing tag: remove the opening tag only
        const int iPrefixEnd = sHtml.Find (_T(">"), iAfterPrefix);
        if (-1 != iPrefixEnd)
        {
            // +1 so the terminating '>' is removed as well and does not
            // stay behind in the document as stray text
            sHtml.Delete (iStart, iPrefixEnd - iStart + 1);
        }
        else
        {
            sHtml.Delete (iStart, nTagLength + 1);      // truncated tag at the end of the document
        }
    }
}
/**
 * CLiteHTMLReader::parseDocument
 * Walks the buffer referenced by m_lpszBuffer and raises the enabled
 * event-handler notifications (BeginParse, StartTag, EndTag, Comment,
 * Characters, EndParse). Any handler can stop the parse by setting the
 * bAbort flag it receives; EndParse is still raised in that case.
 *
 * On exit m_lpszBuffer is reset to NULL and m_dwBufLen to 0.
 *
 * @return the seek position (m_dwBufPos) reached, or 0 when there is
 *         no buffer or the buffer length is 0
 */
UINT CLiteHTMLReader::parseDocument(void)
{
    ATLASSERT(m_lpszBuffer != NULL);

    // All locals are declared before the first 'goto LEndParse' so that no
    // jump bypasses an initialised declaration.
    bool            bAbort = false;             // continue parsing or abort?
    bool            bIsClosingTag = false;      // tag parsed is a closing tag?
    bool            bIsOpeningTag = false;      // tag parsed is an opening tag?
    CString         strCharacters;              // character data
    CString         strComment;                 // comment data
    DWORD           dwCharDataStart = 0L;       // starting position of character data
    DWORD           dwCharDataLen = 0L;         // length of character data
    LONG            lTemp = 0L;                 // length of a resolved entity reference
    TCHAR           ch = 0;                     // character at current buffer position
    CLiteHTMLTag    oTag;                       // tag information
    TTagParsingMode aTagParsingMode = TPM_NORMAL;   // normal / inside script, style or textarea

    if ( (!m_lpszBuffer) || (!m_dwBufLen) )
        return (0U);

    // reset seek pointer to beginning
    ResetSeekPointer();

    // notify event handler about parsing startup
    if (getEventNotify(notifyStartStop))
    {
        bAbort = false;
        m_pEventHandler->BeginParse(m_dwAppData, bAbort);
        if (bAbort) goto LEndParse;
    }

    // skip leading white-space characters
    while (isWhiteSpace(ReadChar()))
        ;
    ch = UngetChar();

    // character data starts at the first non-white-space character,
    // not at the beginning of the buffer
    dwCharDataStart = m_dwBufPos;

    while ((ch = ReadChar()) != _T('\0'))
    {
        switch (ch)
        {
        // tag starting delimiter?
        case _T('<'):
            {
                UngetChar();

                strComment.Empty();
                // reset before parsing so that a comment never inherits
                // the flags of the previously parsed tag
                bIsOpeningTag = false;
                bIsClosingTag = false;

                if (!parseComment(strComment))
                {
                    if (!parseTag(oTag, bIsOpeningTag, bIsClosingTag, aTagParsingMode))
                    {
                        // neither a comment nor a tag: treat '<' as character data
                        ++dwCharDataLen;

                        // manually advance buffer position
                        // because the last call to UngetChar()
                        // moved it back one character
                        ch = ReadChar();

                        break;
                    }
                    else
                    {
                        // a non-inline <script>, <style> or <textarea>
                        // opening tag switches the tag parsing mode
                        if (bIsOpeningTag && (TPM_NORMAL == aTagParsingMode))
                        {
                            if (0 == oTag.getTagName().CompareNoCase(_T("script")))
                            {
                                if (!oTag.IsTagInline())
                                {
                                    aTagParsingMode = TPM_SCRIPT;
                                }
                            }
                            else if (0 == oTag.getTagName().CompareNoCase(_T("style")))
                            {
                                if (!oTag.IsTagInline())
                                {
                                    aTagParsingMode = TPM_STYLE;
                                }
                            }
                            else if (0 == oTag.getTagName().CompareNoCase(_T("textarea")))
                            {
                                if (!oTag.IsTagInline())
                                {
                                    aTagParsingMode = TPM_TEXTAREA;
                                }
                            }
                        }

                        // the matching closing tag switches back to normal mode
                        if (bIsClosingTag && (TPM_NORMAL != aTagParsingMode))
                        {
                            if ((TPM_SCRIPT == aTagParsingMode) && (0 == oTag.getTagName().CompareNoCase(_T("script"))))
                            {
                                aTagParsingMode = TPM_NORMAL;
                            }
                            else if ((TPM_STYLE == aTagParsingMode) && (0 == oTag.getTagName().CompareNoCase(_T("style"))))
                            {
                                aTagParsingMode = TPM_NORMAL;
                            }
                            else if ((TPM_TEXTAREA == aTagParsingMode) && (0 == oTag.getTagName().CompareNoCase(_T("textarea"))))
                            {
                                aTagParsingMode = TPM_NORMAL;
                            }
                        }
                    }
                }

                // clear pending notifications
                if ( (dwCharDataLen) || (strCharacters.GetLength()) )
                {
                    strCharacters += CString(&m_lpszBuffer[dwCharDataStart], dwCharDataLen);
                    NormalizeCharacters(strCharacters);

                    if ( (strCharacters.GetLength()) &&
                         (getEventNotify(notifyCharacters)) )
                    {
                        bAbort = false;
                        m_pEventHandler->Characters(strCharacters, m_dwAppData, bAbort);
                        if (bAbort) goto LEndParse;
                    }

                    strCharacters.Empty();
                }

                // the next run of character data starts after the tag/comment
                dwCharDataLen = 0L;
                dwCharDataStart = m_dwBufPos;

                if (strComment.GetLength())
                {
                    if (getEventNotify(notifyComment))
                    {
                        bAbort = false;
                        m_pEventHandler->Comment(strComment, m_dwAppData, bAbort);
                        if (bAbort) goto LEndParse;
                    }
                }
                else
                {
                    if ( (bIsOpeningTag) && (getEventNotify(notifyTagStart)) )
                    {
                        bAbort = false;
                        m_pEventHandler->StartTag(&oTag, m_dwAppData, bAbort);
                        if (bAbort) goto LEndParse;
                    }

                    if ( (bIsClosingTag) && (getEventNotify(notifyTagEnd)) )
                    {
                        bAbort = false;
                        m_pEventHandler->EndTag(&oTag, m_dwAppData, bAbort);
                        if (bAbort) goto LEndParse;
                    }
                }

                break;
            }

        // entity reference beginning delimiter?
        case _T('&'):
            {
                UngetChar();

                lTemp = 0;
                if (m_bResolveEntities)
                    lTemp = CLiteHTMLEntityResolver::resolveEntity(&m_lpszBuffer[m_dwBufPos], ch);

                if (lTemp)
                {
                    // a resolved soft hyphen (U+00AD) is dropped from the text
                    const wchar_t chSoftHyphen = L'\x00AD';

                    if (ch == chSoftHyphen)
                        strCharacters += CString(&m_lpszBuffer[dwCharDataStart], dwCharDataLen);
                    else
                        strCharacters += CString(&m_lpszBuffer[dwCharDataStart], dwCharDataLen) + ch;

                    // skip the entity reference in the buffer
                    m_dwBufPos += lTemp;
                    dwCharDataStart = m_dwBufPos;
                    dwCharDataLen = 0L;
                }
                else
                {
                    // not a resolvable entity: keep '&' as character data
                    ch = ReadChar();
                    ++dwCharDataLen;
                }

                break;
            }

        // any other character
        default:
            {
                ++dwCharDataLen;
                break;
            }
        }
    }

    // clear pending notifications
    if ( (dwCharDataLen) || (strCharacters.GetLength()) )
    {
        // NOTE(review): ch is always the terminating NUL here (the loop above
        // only ends on it), so "+ ch" appends a NUL character to the pending
        // text. Kept as-is; confirm whether handlers depend on it.
        strCharacters += CString(&m_lpszBuffer[dwCharDataStart], dwCharDataLen) + ch;
        NormalizeCharacters(strCharacters);
        strCharacters.TrimRight();  // explicit trailing white-space removal

        if ( (strCharacters.GetLength()) &&
             (getEventNotify(notifyCharacters)) )
        {
            bAbort = false;
            m_pEventHandler->Characters(strCharacters, m_dwAppData, bAbort);
            if (bAbort) goto LEndParse;
        }
    }

LEndParse:
    // notify event handler about parsing completion
    if (getEventNotify(notifyStartStop))
        m_pEventHandler->EndParse(m_dwAppData, bAbort);

    m_lpszBuffer = NULL;
    m_dwBufLen = 0L;
    return (m_dwBufPos);
}
/**
 * CLiteHTMLReader::Read
 * The Read method parses an HTML document from an
 * in-memory string buffer and raises events defined
 * in ILiteHTMLReaderEvents to notify about various
 * elements.
 *
 * @param lpszString - string containing HTML text to parse
 *
 * @return the value returned by parseDocument(), or 0 when
 *         lpszString is NULL or empty (nothing is parsed then)
 */
UINT CLiteHTMLReader::Read(LPCTSTR lpszString)
{
    ATLASSERT(AtlIsValidString(lpszString));

    // the assertion above is compiled out of release builds;
    // do not hand a NULL pointer to _tcslen()
    if (NULL == lpszString)
    {
        m_dwBufLen = 0L;
        return (0U);
    }

    m_dwBufLen = static_cast<DWORD>(::_tcslen(lpszString));
    if (!m_dwBufLen)
        return (0U);

    m_lpszBuffer = lpszString;
    return (parseDocument());
}
/**
 * CLiteHTMLReader::ReadFile
 * This method is similar to the Read(LPCTSTR) method,
 * except that, it accepts a file HANDLE instead of
 * an in-memory string buffer containing HTML text.
 * The file is mapped into memory and its bytes are
 * parsed directly as TCHARs (no encoding conversion).
 *
 * @param hFile - file handle
 *
 * @return the value returned by parseDocument(), or 0 when the
 *         file is empty or cannot be sized or mapped
 */
UINT CLiteHTMLReader::ReadFile(HANDLE hFile)
{
    ATLASSERT(hFile != INVALID_HANDLE_VALUE);
    ATLASSERT(::GetFileType(hFile) == FILE_TYPE_DISK);

    // Initialised up front: every error path jumps to the shared cleanup
    // code, which must be able to tell which resources were acquired.
    HANDLE  hFileMap = NULL;
    LPCTSTR lpsz = NULL;
    UINT    nRetVal = 0U;

    // determine file size
    m_dwBufLen = ::GetFileSize(hFile, NULL);
    if (m_dwBufLen == INVALID_FILE_SIZE)
    {
        ATLTRACE2("(Error) CLiteHTMLReader::Read:"
                  " GetFileSize() failed;"
                  " GetLastError() returns 0x%08x.\n", ::GetLastError());
        goto LError;
    }

    // calculate length, in TCHARs, of the buffer
    m_dwBufLen /= sizeof(TCHAR);
    if (!m_dwBufLen)
        return (0U);

    // create a file-mapping object for the file
    hFileMap = ::CreateFileMapping(hFile, NULL, PAGE_READONLY, 0L, 0L, NULL);
    if (hFileMap == NULL)
    {
        ATLTRACE2("(Error) CLiteHTMLReader::Read:"
                  " CreateFileMapping() failed;"
                  " GetLastError() returns 0x%08x.\n", ::GetLastError());
        goto LError;
    }

    // map the entire file into the address-space of the application
    lpsz = (LPCTSTR)::MapViewOfFile(hFileMap, FILE_MAP_READ, 0L, 0L, 0L);
    if (lpsz == NULL)
    {
        ATLTRACE2("(Error) CLiteHTMLReader::Read:"
                  " MapViewOfFile() failed;"
                  " GetLastError() returns 0x%08x.\n", ::GetLastError());
        goto LError;
    }

    m_lpszBuffer = lpsz;
    nRetVal = parseDocument();
    goto LCleanExit;

LError:
    nRetVal = 0U;
    m_dwBufLen = 0L;

LCleanExit:
    // release only what was actually acquired
    if (lpsz != NULL)
        ATLVERIFY(::UnmapViewOfFile(lpsz));
    if (hFileMap)
        ATLVERIFY(::CloseHandle(hFileMap));
    return (nRetVal);
}
// Maps a charset label (compared case-insensitively) to the corresponding
// TBOMType value. Returns TB_UNKNOWN for an empty or unrecognized label.
TBOMType CharsetStringToInt (CStringA sCharset)
{
    // known labels, tested in this order
    static const struct
    {
        const char *szLabel;
        TBOMType    eType;
    }
    c_aKnownCharsets[] =
    {
        { "utf-8",        TB_UTF8        },
        { "windows-1250", TB_WINDOWS1250 },     // central europe // single byte
        { "windows-1251", TB_WINDOWS1251 },     // cyrillic
        { "windows-1252", TB_WINDOWS1252 },     // western european
        { "windows-1253", TB_WINDOWS1253 },     // greek
        { "windows-1254", TB_WINDOWS1254 },     // turkish
        { "windows-1255", TB_WINDOWS1255 },     // hebrew
        { "windows-1256", TB_WINDOWS1256 },     // arabic
        { "windows-1257", TB_WINDOWS1257 },     // baltic
        { "windows-1258", TB_WINDOWS1258 },     // vietnamese
        { "windows-874",  TB_WINDOWS874  },     // thai
        { "koi8-r",       TB_KOI8R       },     // russian koi8-r
        { "iso-8859-1",   TB_ISO8859_1   },     // iso-8859-1
        { "shift_jis",    TB_SHIFT_JIS   }      // japanese shift-jis
    };

    if (sCharset.IsEmpty())
        return TB_UNKNOWN;

    const int nKnownCount = sizeof (c_aKnownCharsets) / sizeof (c_aKnownCharsets[0]);
    for (int nIndex = 0; nIndex < nKnownCount; ++nIndex)
    {
        if (0 == sCharset.CompareNoCase (c_aKnownCharsets[nIndex].szLabel))
            return c_aKnownCharsets[nIndex].eType;
    }

    return TB_UNKNOWN;
}
// Detects the encoding of the file behind hFile.
//
// Detection order:
//   1. byte-order mark (GetBomType); nBomSize receives its size
//   2. charset declared in a <meta> tag, either
//        <meta http-equiv="content-type" content="...; charset=NAME"> or
//        <meta charset="NAME">
//   3. encoding="NAME" in a leading "<?...>" declaration
// Steps 2 and 3 only look at the first 1 MB of the file.
//
// @param hFile    - file handle; its file pointer is moved by this call
// @param nBomSize - [out] set to 0 here and then filled in by GetBomType
//
// @return the detected type, TB_UNKNOWN when nothing was recognized,
//         or TB_ERROR when a file operation failed
TBOMType CLiteHTMLReader::DetectCodePage (HANDLE hFile, int& nBomSize)
{
    TBOMType nBomType (TB_UNKNOWN);
    nBomSize = 0;

    try
    {
        // seek to begin of file
        if (INVALID_SET_FILE_POINTER == ::SetFilePointer (hFile, 0, NULL, FILE_BEGIN))
        {
            ATLTRACE2("(Error) CLiteHTMLReader::DetectCodePage:"
                      " ::SetFilePointer failed;"
                      " GetLastError() returns 0x%08x.\n", ::GetLastError());
            throw (UINT) -1;
        }

        nBomType = GetBomType (hFile, nBomSize);
        if (TB_UNKNOWN != nBomType)
        {
            return nBomType;
        }

        // charset is not detected by bom. find it from meta tag
        CStringA sCharset;

        if (INVALID_SET_FILE_POINTER == ::SetFilePointer (hFile, 0, NULL, FILE_BEGIN))
        {
            ATLTRACE2("(Error) CLiteHTMLReader::DetectCodePage:"
                      " ::SetFilePointer failed;"
                      " GetLastError() returns 0x%08x.\n", ::GetLastError());
            throw (UINT) -1;
        }

        // determine buffer size: the whole file, but at most 1 MB
        const DWORD dwFileSize = ::GetFileSize (hFile, NULL);
        if (INVALID_FILE_SIZE == dwFileSize)
        {
            ATLTRACE2("(Error) CLiteHTMLReader::DetectCodePage:"
                      " ::GetFileSize failed;"
                      " GetLastError() returns 0x%08x.\n", ::GetLastError());
            throw (UINT) -1;
        }
        const DWORD dwProbeLimit = 1 * 1024 * 1024;
        const DWORD dwBufferSize = min (dwFileSize, dwProbeLimit);

        CStringA sBuffer;
        BYTE *pBuffer = reinterpret_cast <BYTE *> (sBuffer.GetBuffer (static_cast <int> (dwBufferSize)));

        // load the head of the file
        DWORD dwRead (0);
        const BOOL bReadRes = ::ReadFile (hFile, pBuffer, dwBufferSize, &dwRead, NULL);
        if ((0 == bReadRes) || (dwBufferSize != dwRead))
        {
            ATLTRACE2("(Error) CLiteHTMLReader::DetectCodePage:"
                      " ::ReadFile(Buffer) failed;"
                      " GetLastError() returns 0x%08x.\n", ::GetLastError());
            throw (UINT) -1;
        }
        sBuffer.ReleaseBuffer (static_cast <int> (dwBufferSize));

        // everything below searches lower-case text only
        sBuffer.MakeLower ();

        // seek to the beginning of file
        if (INVALID_SET_FILE_POINTER == ::SetFilePointer (hFile, 0, NULL, FILE_BEGIN))
        {
            ATLTRACE2("(Error) CLiteHTMLReader::DetectCodePage:"
                      " ::SetFilePointer failed;"
                      " GetLastError() returns 0x%08x.\n", ::GetLastError());
            throw (UINT) -1;
        }

        // find <meta> tags from the beginning of the file
        // (usually they are situated at the top of the document)
        int nMetaStart = sBuffer.Find ("<meta");
        while (-1 != nMetaStart)
        {
            nMetaStart += 5; // strlen ("<meta")

            // find closing '>'
            int nMetaEnd = sBuffer.Find (">", nMetaStart);
            if (-1 == nMetaEnd)
            {
                // meta is not closed
                break;
            }
            nMetaEnd++; // strlen (">")

            // attributes of this tag, including the closing '>'
            // (already lower-case because sBuffer is)
            const CStringA sMetaContent = sBuffer.Mid (nMetaStart, nMetaEnd - nMetaStart);

            const int nHttpEquiv = sMetaContent.Find ("http-equiv=");
            const int nContent = sMetaContent.Find ("content=");

            int nCharset = -1;  // position of "charset=" in sMetaContent
            if (-1 != nHttpEquiv && -1 != nContent)
            {
                // <meta http-equiv="content-type" content="...; charset=NAME">
                if (-1 != sMetaContent.Find ("content-type", nHttpEquiv + 11))
                    nCharset = sMetaContent.Find ("charset=", nContent + 8);
            }
            else if (-1 == nHttpEquiv && -1 == nContent)
            {
                // <meta charset="NAME">
                nCharset = sMetaContent.Find ("charset=");
            }

            if (-1 != nCharset)
            {
                // The name may be preceded by an opening quote and ends at
                // the first quote (either kind), white-space, ';', '/' or '>'.
                CStringA sValue = sMetaContent.Mid (nCharset + 8);  // strlen ("charset=")
                sValue.TrimLeft ("\"' \t\r\n");
                const int nValueEnd = sValue.FindOneOf ("\"' \t\r\n;>/");
                if (-1 != nValueEnd)
                {
                    sCharset = sValue.Left (nValueEnd);
                }
            }

            if (!sCharset.IsEmpty())
            {
                // the first declared charset decides, even if it is not a known one
                nBomType = CharsetStringToInt (sCharset);
                break;
            }

            // find next meta tag
            nMetaStart = sBuffer.Find ("<meta", nMetaEnd);
        }

        if (TB_UNKNOWN == nBomType)
        {
            const int nXmlStart = sBuffer.Find ("<?");
            if (-1 != nXmlStart)
            {
                const int nXmlEnd = sBuffer.Find ('>', nXmlStart + 2);
                if (-1 != nXmlEnd)
                {
                    // string like 'xml version='1.0' encoding="UTF-8"'
                    const CStringA sXmlNodeString = sBuffer.Mid (nXmlStart + 2, nXmlEnd - nXmlStart - 2);

                    const int nWordStart = sXmlNodeString.Find ("encoding");
                    if (-1 != nWordStart)
                    {
                        // text after "encoding" and the character following it ('=')
                        const CStringA sTail = sXmlNodeString.Mid (nWordStart + 9);

                        // the value is enclosed in single or double quotes
                        const int nQuoteStart = sTail.FindOneOf ("\"'");
                        if (-1 != nQuoteStart)
                        {
                            const char chQuote = sTail[nQuoteStart];
                            const int nQuoteEnd = sTail.Find (chQuote, nQuoteStart + 1);
                            if (-1 != nQuoteEnd)
                            {
                                nBomType = CharsetStringToInt (sTail.Mid (nQuoteStart + 1, nQuoteEnd - nQuoteStart - 1));
                            }
                        }
                    }
                }
            }
        }
    }
    catch (UINT eRes)
    {
        eRes;   // referenced only to avoid an "unreferenced variable" warning
        nBomType = TB_ERROR;
    }
    return nBomType;
}
/**
 * CLiteHTMLReader::ReadFile2
 * Parses an HTML file whose encoding is detected with DetectCodePage():
 * the file is mapped into memory, converted to a CString, stripped of
 * <script> elements and comments, and passed to parseDocument().
 *
 * @param hFile     - file handle
 * @param aCodePage - code page to use when DetectCodePage() reports
 *                    TB_UNKNOWN
 *
 * @return the value returned by parseDocument(), or 0 when the file is
 *         empty or any step failed
 */
UINT CLiteHTMLReader::ReadFile2 (HANDLE hFile, UINT aCodePage)
{
    ATLASSERT(hFile != INVALID_HANDLE_VALUE);
    ATLASSERT(::GetFileType(hFile) == FILE_TYPE_DISK);

    int nBomSize (0);
    TBOMType nCodePage = DetectCodePage (hFile, nBomSize);

    HANDLE  hFileMap (NULL);
    LPCVOID lpViewBase (NULL);  // address returned by MapViewOfFile, needed for unmapping
    UINT    nRetVal (0);
    CString sHtml;

    try
    {
        // determine file size
        m_dwBufLen = ::GetFileSize(hFile, NULL);
        if (m_dwBufLen == INVALID_FILE_SIZE)
        {
            ATLTRACE2("(Error) CLiteHTMLReader::Read:"
                      " GetFileSize() failed;"
                      " GetLastError() returns 0x%08x.\n", ::GetLastError());
            // A bare "throw;" has no exception to rethrow here and would
            // terminate the process, so a value is thrown instead.
            throw (UINT) -1;
        }

        if (0 == m_dwBufLen)
            return (0U);

        // create a file-mapping object for the file
        hFileMap = ::CreateFileMapping(hFile, NULL, PAGE_READONLY, 0L, 0L, NULL);
        if (hFileMap == NULL)
        {
            ATLTRACE2("(Error) CLiteHTMLReader::Read:"
                      " CreateFileMapping() failed;"
                      " GetLastError() returns 0x%08x.\n", ::GetLastError());
            throw (UINT) -1;
        }

        // map the entire file into the address-space of the application
        lpViewBase = ::MapViewOfFile(hFileMap, FILE_MAP_READ, 0L, 0L, 0L);
        if (lpViewBase == NULL)
        {
            ATLTRACE2("(Error) CLiteHTMLReader::Read:"
                      " MapViewOfFile() failed;"
                      " GetLastError() returns 0x%08x.\n", ::GetLastError());
            throw (UINT) -1;
        }

        // skip BOM (lpViewBase keeps the unmodified base address)
        LPCSTR lpsz = static_cast <LPCSTR> (lpViewBase) + nBomSize;
        m_dwBufLen -= nBomSize;

        // translate the detected type to a Windows code page where one exists
        bool bSimpleConvertToUnicode = true;
        switch (nCodePage)
        {
        case TB_UNKNOWN:
            // do nothing (default code page passed through func params)
            break;
        case TB_UTF7:
            aCodePage = CP_UTF7;
            break;
        case TB_UTF8:
            aCodePage = CP_UTF8;
            break;
        case TB_WINDOWS1250:
            aCodePage = 1250;
            break;
        case TB_WINDOWS1251:
            aCodePage = 1251;
            break;
        case TB_WINDOWS1252:
        case TB_ISO8859_1:      // html5 says that
            aCodePage = 1252;
            break;
        case TB_WINDOWS1253:
            aCodePage = 1253;
            break;
        case TB_WINDOWS1254:
            aCodePage = 1254;
            break;
        case TB_WINDOWS1255:
            aCodePage = 1255;
            break;
        case TB_WINDOWS1256:
            aCodePage = 1256;
            break;
        case TB_WINDOWS1257:
            aCodePage = 1257;
            break;
        case TB_WINDOWS1258:
            aCodePage = 1258;
            break;
        case TB_WINDOWS874:
            aCodePage = 874;
            break;
        case TB_SHIFT_JIS:
            aCodePage = 932;
            break;
        default:
            // need complex conversion
            bSimpleConvertToUnicode = false;
            break;
        }

        if (bSimpleConvertToUnicode)
        {
            BOOL bRes = ConvertMultibyteToUnicode (aCodePage, lpsz, m_dwBufLen, sHtml);
            if (!bRes)
                throw (UINT) -1;
        }
        else
        {
            // Any other type (TB_ERROR included) matches none of the branches
            // below and leaves sHtml empty.
            if (TB_KOI8R == nCodePage)
            {
                BOOL bRes = ConvertKoi8RToUnicode (lpsz, m_dwBufLen, sHtml);
                if (!bRes)
                    throw (UINT) -1;
            }
            else if (TB_UTF16LE == nCodePage)
            {
                const int nSizeInChars = m_dwBufLen / sizeof (WCHAR);
                WCHAR *pwcBuffer = sHtml.GetBufferSetLength (nSizeInChars);
                if (NULL != pwcBuffer)
                {
                    ::memcpy (pwcBuffer, lpsz, nSizeInChars * sizeof (WCHAR)); // even quantity of bytes
                    sHtml.ReleaseBuffer ();
                }
            }
            else if (TB_UTF16BE == nCodePage)
            {
                const int nSizeInChars = m_dwBufLen / sizeof (WCHAR);
                WCHAR *pwcBuffer = sHtml.GetBufferSetLength (nSizeInChars);
                if (NULL != pwcBuffer)
                {
                    // Read the bytes as unsigned: a plain (signed) char >= 0x80
                    // would be sign-extended and overwrite the high byte.
                    const BYTE *pBytes = reinterpret_cast <const BYTE *> (lpsz);
                    for (int nChar = 0; nChar < nSizeInChars; nChar ++)
                    {
                        const BYTE cHigh = pBytes[nChar << 1];
                        const BYTE cLow  = pBytes[(nChar << 1) + 1];
                        pwcBuffer[nChar] = static_cast <WCHAR> ((cHigh << 8) | cLow);
                    }
                    sHtml.ReleaseBuffer ();
                }
            }
        }

        // delete script
        // NOTE(review): DeleteTags is case-sensitive, so mixed-case spellings
        // such as <Script> are not removed here.
        DeleteTags (sHtml, _T("script"));
        DeleteTags (sHtml, _T("SCRIPT"));
        DeleteComments (sHtml);

        // NOTE(review): m_dwBufLen still holds the byte count of the source
        // file, not the length of sHtml - confirm parseDocument() only relies
        // on the terminating NUL.
        m_lpszBuffer = sHtml.GetBuffer();
        nRetVal = parseDocument ();
        sHtml.ReleaseBuffer();
    }
    catch (...)
    {
        nRetVal = 0U;
        m_dwBufLen = 0L;
    }

    // UnmapViewOfFile must receive the address MapViewOfFile returned,
    // not the pointer advanced past the BOM
    if (lpViewBase != NULL)
        ATLVERIFY(::UnmapViewOfFile(lpViewBase));
    if (hFileMap)
        ATLVERIFY(::CloseHandle(hFileMap));
    return (nRetVal);
}
// static
// Converts aInputSize bytes of KOI8-R text at aInput into aOut.
// Bytes below 0x80 are copied unchanged; bytes 0x80..0xFF are looked up
// in the table below. Returns FALSE only when the output buffer
// could not be obtained.
BOOL CLiteHTMLReader::ConvertKoi8RToUnicode (const LPCSTR aInput, const int aInputSize, CStringW &aOut) throw()
{
    ATLASSERT (aInput);
    ATLASSERT (aInputSize);

    // rfc-1489: code points for the upper half (0x80..0xFF) of KOI8-R
    static const USHORT c_aUpperHalf[128] = {
        0x2500,0x2502,0x250c,0x2510,0x2514,0x2518,0x251c,0x2524,
        0x252c,0x2534,0x253c,0x2580,0x2584,0x2588,0x258c,0x2590,
        0x2591,0x2592,0x2593,0x2320,0x25a0,0x2219,0x221a,0x2248,
        0x2264,0x2265,0x00a0,0x2321,0x00b0,0x00b2,0x00b7,0x00f7,
        0x2550,0x2551,0x2552,0x0451,0x2553,0x2554,0x2555,0x2556,
        0x2557,0x2558,0x2559,0x255a,0x255b,0x255c,0x255d,0x255e,
        0x255f,0x2560,0x2561,0x0401,0x2562,0x2563,0x2564,0x2565,
        0x2566,0x2567,0x2568,0x2569,0x256a,0x256b,0x256c,0x00a9,
        0x044e,0x0430,0x0431,0x0446,0x0434,0x0435,0x0444,0x0433,
        0x0445,0x0438,0x0439,0x043a,0x043b,0x043c,0x043d,0x043e,
        0x043f,0x044f,0x0440,0x0441,0x0442,0x0443,0x0436,0x0432,
        0x044c,0x044b,0x0437,0x0448,0x044d,0x0449,0x0447,0x044a,
        0x042e,0x0410,0x0411,0x0426,0x0414,0x0415,0x0424,0x0413,
        0x0425,0x0418,0x0419,0x041a,0x041b,0x041c,0x041d,0x041e,
        0x041f,0x042f,0x0420,0x0421,0x0422,0x0423,0x0416,0x0412,
        0x042c,0x042b,0x0417,0x0428,0x042d,0x0429,0x0427,0x042a
    };

    WCHAR *pwcOut = aOut.GetBufferSetLength (aInputSize);
    if (NULL == pwcOut)
        return FALSE;

    const BYTE *pKoi8 = reinterpret_cast <const BYTE *> (aInput);
    for (int nIndex = 0; nIndex < aInputSize; ++nIndex)
    {
        const BYTE cByte = pKoi8[nIndex];
        if (cByte & 0x80)
        {
            // upper half: translate through the table
            pwcOut[nIndex] = (WCHAR) c_aUpperHalf[cByte & 0x7F];
        }
        else
        {
            // lower half is copied as-is
            pwcOut[nIndex] = (WCHAR) cByte;
        }
    }

    aOut.ReleaseBuffer ();
    return TRUE;
}
/*
// static
BOOL CLiteHTMLReader::ConvertMultibyteToUnicode (const UINT aCodePage, const LPCSTR aInput, const int aInputSize, CStringW &aOut) throw()
{
ATLASSERT (aInput);
ATLASSERT (aInputSize);
const int nUnicodeSize = MultiByteToWideChar (aCodePage, 0, aInput, aInputSize, NULL, 0);
WCHAR *pwStr = NULL;
if (0 != nUnicodeSize)
{
try
{
pwStr = new WCHAR [nUnicodeSize];
}
catch (std::bad_alloc& ba)
{
ATLTRACE2("(Error) CLiteHTMLReader::ConvertMultibyteToUnicode:"
"new[] failed;"
" exception: %s\n", ba.what ());
}
if (NULL == pwStr)
{
ATLTRACE2("(Error) CLiteHTMLReader::ConvertMultibyteToUnicode:"
" new failed;"
" GetLastError() returns 0x%08x.\n", ::GetLastError());
return FALSE;
}
const int nStrSize = ::MultiByteToWideChar (aCodePage, 0, aInput, aInputSize, pwStr, nUnicodeSize);
ATLTRACE2 ("CLiteHTMLReader::ConvertMultibyteToUnicode(): %i bytes has been converted to unicode\n", nStrSize);
}
// Preprocess
if (NULL != pwStr)
{
aOut = pwStr;
delete [] pwStr;
pwStr = NULL;
return TRUE;
}
return FALSE;
}
*/
#pragma warning(pop)