Files
core/ASCOfficeHtmlFile_SaxHtmlParser/HTMLReaderLib/LiteHTMLReader.h

475 lines
12 KiB
C++

#pragma once
#include "LiteHTMLTag.h"
#include "UnicodeTextFile.h"
using namespace NUnicodeTextFile;
#pragma warning(push, 4)
class CLiteHTMLReader;
/**
 * ILiteHTMLReaderEvents
 *
 * Callback interface through which a CLiteHTMLReader reports parsing
 * events. Every handler is protected and CLiteHTMLReader is a friend,
 * so only the reader can invoke them. The default implementations
 * ignore their arguments and leave the abort flag cleared; derive from
 * this class and override the events of interest.
 *
 * Common parameters:
 *   dwAppData - application-specific value passed in by the reader
 *   bAbort    - output flag; the defaults always set it to false
 *               (presumably setting it to true asks the reader to stop;
 *               confirm against CLiteHTMLReader::parseDocument)
 */
class ILiteHTMLReaderEvents
{
    friend class CLiteHTMLReader;

// Events
protected:
    /** Default handler for the "begin parse" notification: does nothing. */
    virtual void BeginParse(DWORD dwAppData, bool &bAbort)
    {
        (void)dwAppData;    // intentionally unused
        bAbort = false;
    }

    /** Default handler for a tag-start notification: does nothing. */
    virtual void StartTag(CLiteHTMLTag *pTag, DWORD dwAppData, bool &bAbort)
    {
        (void)pTag;         // intentionally unused
        (void)dwAppData;
        bAbort = false;
    }

    /** Default handler for a tag-end notification: does nothing. */
    virtual void EndTag(CLiteHTMLTag *pTag, DWORD dwAppData, bool &bAbort)
    {
        (void)pTag;         // intentionally unused
        (void)dwAppData;
        bAbort = false;
    }

    /** Default handler for a character-data notification: does nothing. */
    virtual void Characters(const CString &rText, DWORD dwAppData, bool &bAbort)
    {
        (void)rText;        // intentionally unused
        (void)dwAppData;
        bAbort = false;
    }

    /** Default handler for a comment notification: does nothing. */
    virtual void Comment(const CString &rComment, DWORD dwAppData, bool &bAbort)
    {
        (void)rComment;     // intentionally unused
        (void)dwAppData;
        bAbort = false;
    }

    /**
     * Default handler for the "end parse" notification: does nothing.
     *
     * @param bIsAborted - abort status supplied by the reader
     */
    virtual void EndParse(DWORD dwAppData, bool bIsAborted)
    {
        (void)dwAppData;    // intentionally unused
        (void)bIsAborted;
    }

public:
    // Pure virtual so the interface itself stays abstract. The body is
    // supplied outside the class because standard C++ does not allow a
    // pure-specifier on an in-class function definition.
    virtual ~ILiteHTMLReaderEvents() = 0;
};

// A pure virtual destructor still needs a definition: destructors of
// derived classes call it implicitly.
inline ILiteHTMLReaderEvents::~ILiteHTMLReaderEvents()
{
}
class CLiteHTMLReader
{
public:
enum EventMaskEnum {
/** @since 1.0 */
notifyStartStop = 0x00000001L, // raise BeginParse and EndParse?
/** @since 1.0 */
notifyTagStart = 0x00000002L, // raise StartTag?
/** @since 1.0 */
notifyTagEnd = 0x00000004L, // raise EndTag?
/** @since 1.0 */
notifyCharacters = 0x00000008L, // raise Characters?
/** @since 1.0 */
notifyComment = 0x00000010L, // raise Comment?
};
enum ReaderOptionsEnum {
/** @since 1.0 */
resolveEntities, // determines whether entity references should be resolved
// TODO:
// TODO: add more reader options
// TODO:
};
// Construction/Destruction
public:
CLiteHTMLReader()
{
m_bResolveEntities = true; // entities are resolved, by default
m_dwAppData = 0L; // reasonable default!
m_dwBufPos = 0L; // start from the very beginning
m_dwBufLen = 0L; // buffer length is unknown yet
// default is to raise all of the events
m_eventMask = (EventMaskEnum)(notifyStartStop |
notifyTagStart |
notifyTagEnd |
notifyCharacters |
notifyComment );
m_pEventHandler = NULL; // no event handler is associated
m_lpszBuffer = NULL;
}
public:
/**
* Returns an event mask which signifies the notification
* messages a CLiteHTMLReader will send while parsing HTML
* text.
*/
EventMaskEnum getEventMask(void) const
{ return (m_eventMask); }
/**
* Sets a new event mask.
*
* @param dwNewEventMask - new event mask
*/
EventMaskEnum setEventMask(DWORD dwNewEventMask)
{
EventMaskEnum oldMask = m_eventMask;
m_eventMask = (EventMaskEnum)dwNewEventMask;
return (oldMask);
}
/**
* Changes the current event mask by adding and removing
* flags specified by addFlags and removeFlags, respectively.
*
* @param addFlags - flags to add in the current event mask
* @param removeFlags - flags to remove from the current event mask
*/
EventMaskEnum setEventMask(DWORD addFlags, DWORD removeFlags)
{
DWORD dwOldMask = (DWORD)m_eventMask;
DWORD dwNewMask = (dwOldMask | addFlags) & ~removeFlags;
m_eventMask = (EventMaskEnum)dwNewMask;
return ((EventMaskEnum)dwOldMask);
}
/**
* Returns a 32-bit application-specific data
* previously set by a call to setAppData()
*/
DWORD getAppData(void) const
{ return (m_dwAppData); }
/**
* Allows you to store 32-bit application-specific
* data that will be passed to event handlers on each call
*
* @param dwNewAppData - Application-specific data
*/
DWORD setAppData(DWORD dwNewAppData)
{
DWORD dwOldAppData = m_dwAppData;
m_dwAppData = dwNewAppData;
return (dwOldAppData);
}
/**
* Returns a pointer to an event handler registered with
* a CLiteHTMLReader by a previous call to setEventHandler().
*/
ILiteHTMLReaderEvents* getEventHandler(void) const
{ return (m_pEventHandler); }
/**
* Registers an event handler with a CLiteHTMLReader. If no
* event handler is registered with the reader, all events
* raised by the reader will be ignored. An application can
* change the event handler even when the parsing process
* is in progress.
*
* @param pNewHandler - pointer to an event handler.
* This parameter can be NULL also.
*/
ILiteHTMLReaderEvents* setEventHandler(ILiteHTMLReaderEvents* pNewHandler)
{
ILiteHTMLReaderEvents *pOldHandler = m_pEventHandler;
m_pEventHandler = pNewHandler;
return (pOldHandler);
}
// returns the current value for the specified option
bool getBoolOption(ReaderOptionsEnum option, bool& bCurVal) const;
// sets a new value for the specified option
bool setBoolOption(ReaderOptionsEnum option, bool bNewVal);
// Operations
public:
// parses an HTML document from the specified string
UINT Read(LPCTSTR lpszString);
// parses an HTML document from a file given its HANDLE
UINT ReadFile(HANDLE hFile);
UINT ReadFile2 (HANDLE hFile, UINT aCodePage = CP_ACP);
TBOMType DetectCodePage (HANDLE hFile, int &nBomSize);
//static BOOL ConvertMultibyteToUnicode (const UINT aCodePage, const LPCSTR aInput, const int aInputSize, CStringW &aOut) throw();
static BOOL ConvertKoi8RToUnicode (const LPCSTR aInput, const int aInputSize, CStringW &aOut) throw();
static void DeleteTags (CString &sHtml, CString sTagName);
static void DeleteComments (CString &sHtml);
// Helpers
protected:
/** Parsing Helpers */
// parses an HTML document, and returns the
// number of characters successfully parsed
virtual UINT parseDocument(void);
// parses an HTML comment from the buffer starting from
// the current buffer position and returns true on sucess
virtual bool parseComment(CString &rComment);
// parses an HTML tag from the buffer starting from
// the current buffer position and returns true on success
virtual bool parseTag(CLiteHTMLTag &rTag, bool &bIsOpeningTag, bool &bIsClosingTag, TTagParsingMode &aTagParsingMode);
virtual void NormalizeCharacters(CString &rCharacters)
{
rCharacters;
//rCharacters.Replace(_T("\r\n"), _T(""));
//rCharacters.Remove(_T('\n'));
//rCharacters.Replace(_T('\r'), _T(' '));
//rCharacters.Replace(_T('\t'), _T(' '));
}
/** Buffer Manipulation Helpers */
/**
* Resets the buffer position back to the beginning
*/
void ResetSeekPointer(void)
{ m_dwBufPos = 0L; }
/**
* Reads the next character and advances the buffer position
*/
TCHAR ReadChar(void)
{
ATLASSERT(m_lpszBuffer != NULL);
if (m_dwBufPos >= m_dwBufLen)
return (NULL);
return (m_lpszBuffer[m_dwBufPos++]);
}
/**
* Moves the buffer back by one TCHAR
*/
TCHAR UngetChar(void)
{
ATLASSERT(m_lpszBuffer != NULL);
ATLASSERT(m_dwBufPos);
return (m_lpszBuffer[--m_dwBufPos]);
}
/** Other Helpers */
/**
* Determines if the specified event's notification is to be raised
*/
bool getEventNotify(DWORD dwEvent) const
{
ATLASSERT(dwEvent == notifyStartStop ||
dwEvent == notifyTagStart ||
dwEvent == notifyTagEnd ||
dwEvent == notifyCharacters ||
dwEvent == notifyComment);
if (m_pEventHandler == NULL)
return (false);
return ((m_eventMask & dwEvent) == dwEvent);
}
/**
* Determines if the character specified by ch is
* a white-space character. White-space characters
* are defined as ASCII 0x9-0xD,0x20
*/
bool isWhiteSpace(TCHAR ch) const
{ return (::_istspace(ch) ? true : false); }
protected:
/**
* Determines if character entities are to be resolved
* Default is true.
*/
bool m_bResolveEntities;
/**
* 32-bit app-specific data (to be passed to callbacks)
* Default is 0.
*/
DWORD m_dwAppData;
/**
* Position of the seek pointer
*/
DWORD m_dwBufPos;
/**
* size, in TCHARs, of the buffer
*/
DWORD m_dwBufLen;
/**
* Bit-mask flags to customize events notification(s)
* Default is the ORed result of all EventMaskEnum flags.
*/
EventMaskEnum m_eventMask;
/**
* Pointer to an ILiteHTMLReaderEvents based event handling object
* Default is NULL
*/
ILiteHTMLReaderEvents* m_pEventHandler;
/**
* Pointer to an array of characters being parsed
*/
LPCTSTR m_lpszBuffer;
};
/**
 * Retrieves the current setting of a boolean reader option.
 *
 * @param option - option to inquire
 *        (one of the ReaderOptionsEnum constants)
 * @param bCurVal - receives the option's current value when the
 *        option is recognized; left untouched otherwise.
 *
 * @return true if the option was recognized; otherwise false.
 */
inline bool CLiteHTMLReader::getBoolOption(ReaderOptionsEnum option, bool& bCurVal) const
{
    switch (option)
    {
    case resolveEntities:
        bCurVal = m_bResolveEntities;
        return (true);

    default:
        // not an option this reader knows about
        return (false);
    }
}
/**
 * Assigns a new value to a boolean reader option.
 *
 * @param option - option to change
 *        (one of the ReaderOptionsEnum constants)
 * @param bNewVal - value to set
 *
 * @return true if the option was recognized and updated;
 *         false if it is unknown, in which case nothing changes.
 */
inline bool CLiteHTMLReader::setBoolOption(ReaderOptionsEnum option, bool bNewVal)
{
    switch (option)
    {
    case resolveEntities:
        m_bResolveEntities = bNewVal;
        return (true);

    default:
        // not an option this reader knows about
        return (false);
    }
}
/**
 * Parses an HTML comment starting from the current buffer position.
 *
 * The comment must open with '<!--'. It is closed by a '--' that is
 * followed, after optional white-space, by '>'. A '--' that is not
 * followed by '>' is treated as part of the comment text and the scan
 * continues with the next (possibly overlapping) '--', so inputs such
 * as '<!-- a -- b -->' and '<!-- text --->' are accepted.
 *
 * If no '--' occurs at all after the opening delimiter, the rest of
 * the buffer is taken as the comment. The buffer is expected to be
 * null-terminated (it is scanned with _tcsstr).
 *
 * @param rComment - receives the comment text (without delimiters);
 *        not modified when the function fails
 *
 * @return true if a comment was parsed, in which case the buffer
 *         position has been advanced past it; false otherwise, in
 *         which case the buffer position is unchanged
 */
inline bool CLiteHTMLReader::parseComment(CString &rComment)
{
    ATLASSERT(m_lpszBuffer != NULL);
    ATLASSERT(m_dwBufPos + 4 < m_dwBufLen);

    LPCTSTR lpszStart = &m_lpszBuffer[m_dwBufPos];

    // HTML comments begin with the '<!' delimiter and
    // are immediately followed by two hyphens '--'
    if (::_tcsncmp(lpszStart, _T("<!--"), 4))
        return (false);

    LPCTSTR lpszBegin  = lpszStart + 4;                 // first character of the comment text
    LPCTSTR lpszBufEnd = m_lpszBuffer + m_dwBufLen;     // one past the last buffer character

    // HTML comments end with two hyphen symbols '--'
    LPCTSTR lpszEnd = ::_tcsstr(lpszBegin, _T("--"));

    // comment ending delimiter could not be found?
    if (lpszEnd == NULL)
    {
        // consider everything after current buffer position a comment
        rComment = lpszBegin;
        m_dwBufPos += (4 + rComment.GetLength());
        return (true);
    }

    // Examine each '--' in turn. Restarting the search one character
    // further (rather than two) lets overlapping candidates match, which
    // is what makes a '--->' terminator work.
    for (; lpszEnd != NULL; lpszEnd = ::_tcsstr(lpszEnd + 1, _T("--")))
    {
        // '--' sits at the very end of the buffer: no room for '>'
        if (lpszEnd + 2 >= lpszBufEnd)
            return (false);

        // skip white-space characters after the candidate delimiter '--'
        LPCTSTR lpszAfter = lpszEnd + 2;
        while (::_istspace(*lpszAfter))
            lpszAfter = ::_tcsinc(lpszAfter);

        if (*lpszAfter == _T('>'))
        {
            // step over '>' and commit: lpszAfter now marks the new position
            lpszAfter = ::_tcsinc(lpszAfter);
            CString strComment(lpszBegin, int (lpszEnd - lpszBegin));
            m_dwBufPos += (DWORD) (lpszAfter - lpszStart);
            rComment = strComment;
            return (true);
        }
        // otherwise this '--' belongs to the comment text; keep looking
    }

    // comment has not been terminated properly
    return (false);
}
/**
 * Parses an HTML tag starting from the current buffer position by
 * delegating to CLiteHTMLTag::parseFromStr, which is given the text at
 * the current position and the number of TCHARs left in the buffer.
 *
 * @param rTag - this will receive tag information (along with its attributes)
 * @param bIsOpeningTag - receives true if the tag parsed is an opening tag.
 * @param bIsClosingTag - receives true if the tag parsed is a closing tag.
 * @param aTagParsingMode - parsing mode, forwarded unchanged to
 *        CLiteHTMLTag::parseFromStr
 *
 * @return true if parseFromStr reported a non-zero count, in which case
 *         the buffer position is advanced by that count; false otherwise
 */
inline bool CLiteHTMLReader::parseTag(CLiteHTMLTag &rTag,
                                      bool &bIsOpeningTag,
                                      bool &bIsClosingTag,
                                      TTagParsingMode &aTagParsingMode)
{
    ATLASSERT(m_lpszBuffer != NULL);
    ATLASSERT(m_dwBufPos >= 0L);
    ATLASSERT(m_dwBufPos + 4 < m_dwBufLen);

    LPCTSTR lpszTagText = &m_lpszBuffer[m_dwBufPos];
    const DWORD dwRemaining = m_dwBufLen - m_dwBufPos;

    const UINT nConsumed = rTag.parseFromStr(lpszTagText,
        bIsOpeningTag, bIsClosingTag, aTagParsingMode, dwRemaining);

    // a zero count means nothing could be parsed; leave the position alone
    if (nConsumed == 0)
        return (false);

    m_dwBufPos += nConsumed;
    return (true);
}
#pragma warning(pop)