core/ASCOfficeHtmlFile_SaxHtmlParser/HTMLReaderLib/LiteHTMLReader.h

#pragma once
#include "LiteHTMLTag.h"
#include "UnicodeTextFile.h"

using namespace NUnicodeTextFile;


#pragma warning(push, 4)

class CLiteHTMLReader;

/**
 * ILiteHTMLReaderEvents
 *
 * Callback interface used by CLiteHTMLReader to deliver parsing events.
 * Every handler has a default implementation that ignores its arguments,
 * so a derived class only needs to override the events it cares about.
 * The handlers are protected; CLiteHTMLReader reaches them through the
 * friend declaration. The class is abstract because its destructor is
 * pure virtual.
 */
class ILiteHTMLReaderEvents
{
	friend class CLiteHTMLReader;

// Events
protected:
	/**
	 * Default BeginParse handler; ignores its input.
	 *
	 * @param dwAppData - application-specific data (unused here)
	 * @param bAbort - receives false, i.e. no abort is requested
	 */
	virtual void BeginParse(DWORD dwAppData, bool &bAbort)
	{
		(void)dwAppData;
		bAbort = false;
	}

	/**
	 * Default StartTag handler; ignores its input.
	 *
	 * @param pTag - tag the event refers to (unused here)
	 * @param dwAppData - application-specific data (unused here)
	 * @param bAbort - receives false, i.e. no abort is requested
	 */
	virtual void StartTag(CLiteHTMLTag *pTag, DWORD dwAppData, bool &bAbort)
	{
		(void)pTag;
		(void)dwAppData;
		bAbort = false;
	}

	/**
	 * Default EndTag handler; ignores its input.
	 *
	 * @param pTag - tag the event refers to (unused here)
	 * @param dwAppData - application-specific data (unused here)
	 * @param bAbort - receives false, i.e. no abort is requested
	 */
	virtual void EndTag(CLiteHTMLTag *pTag, DWORD dwAppData, bool &bAbort)
	{
		(void)pTag;
		(void)dwAppData;
		bAbort = false;
	}

	/**
	 * Default Characters handler; ignores its input.
	 *
	 * @param rText - text the event refers to (unused here)
	 * @param dwAppData - application-specific data (unused here)
	 * @param bAbort - receives false, i.e. no abort is requested
	 */
	virtual void Characters(const CString &rText, DWORD dwAppData, bool &bAbort)
	{
		(void)rText;
		(void)dwAppData;
		bAbort = false;
	}

	/**
	 * Default Comment handler; ignores its input.
	 *
	 * @param rComment - comment text the event refers to (unused here)
	 * @param dwAppData - application-specific data (unused here)
	 * @param bAbort - receives false, i.e. no abort is requested
	 */
	virtual void Comment(const CString &rComment, DWORD dwAppData, bool &bAbort)
	{
		(void)rComment;
		(void)dwAppData;
		bAbort = false;
	}

	/**
	 * Default EndParse handler; does nothing.
	 *
	 * @param dwAppData - application-specific data (unused here)
	 * @param bIsAborted - abort indicator supplied by the reader (unused here)
	 */
	virtual void EndParse(DWORD dwAppData, bool bIsAborted)
	{
		(void)dwAppData;
		(void)bIsAborted;
	}

public:
	// Pure virtual to keep the interface abstract. Standard C++ does not
	// allow a pure-specifier on an in-class definition, so the (required)
	// body is provided right after the class.
	virtual ~ILiteHTMLReaderEvents() = 0;
};

// A pure virtual destructor still needs a definition, because every
// derived destructor calls it.
inline ILiteHTMLReaderEvents::~ILiteHTMLReaderEvents()
{
}


class CLiteHTMLReader
{
public:
	enum EventMaskEnum {
		/** @since 1.0 */
		notifyStartStop		= 0x00000001L,	// raise BeginParse and EndParse?

		/** @since 1.0 */
		notifyTagStart		= 0x00000002L,	// raise StartTag?

		/** @since 1.0 */
		notifyTagEnd		= 0x00000004L,	// raise EndTag?

		/** @since 1.0 */
		notifyCharacters	= 0x00000008L,	// raise Characters?

		/** @since 1.0 */
		notifyComment		= 0x00000010L,	// raise Comment?
	};

	enum ReaderOptionsEnum {
		/** @since 1.0 */
		resolveEntities,	// determines whether entity references should be resolved

		// TODO:
		// TODO: add more reader options
		// TODO:
	};

// Construction/Destruction
public:
	CLiteHTMLReader()
	{
		m_bResolveEntities = true;	// entities are resolved, by default
		m_dwAppData = 0L;	// reasonable default!
		m_dwBufPos = 0L;	// start from the very beginning
		m_dwBufLen = 0L;	// buffer length is unknown yet

		// default is to raise all of the events
		m_eventMask = (EventMaskEnum)(notifyStartStop  |
									  notifyTagStart   |
									  notifyTagEnd     |
									  notifyCharacters |
									  notifyComment    );

		m_pEventHandler = NULL;	// no event handler is associated
		m_lpszBuffer = NULL;
	}

public:
	/**
	 * Returns an event mask which signifies the notification
	 * messages a CLiteHTMLReader will send while parsing HTML
	 * text.
	 */
	EventMaskEnum getEventMask(void) const
		{ return (m_eventMask); }

	/**
	 * Sets a new event mask.
	 *
	 * @param dwNewEventMask - new event mask
	 */
	EventMaskEnum setEventMask(DWORD dwNewEventMask)
	{
		EventMaskEnum	oldMask = m_eventMask;
		m_eventMask = (EventMaskEnum)dwNewEventMask;
		return (oldMask);
	}

	/**
	 * Changes the current event mask by adding and removing
	 * flags specified by addFlags and removeFlags, respectively.
	 *
	 * @param addFlags - flags to add in the current event mask
	 * @param removeFlags - flags to remove from the current event mask
	 */
	EventMaskEnum setEventMask(DWORD addFlags, DWORD removeFlags)
	{
		DWORD	dwOldMask = (DWORD)m_eventMask;
		DWORD	dwNewMask = (dwOldMask | addFlags) & ~removeFlags;
		m_eventMask = (EventMaskEnum)dwNewMask;
		return ((EventMaskEnum)dwOldMask);
	}

	/**
	 * Returns a 32-bit application-specific data
	 * previously set by a call to setAppData()
	 */
	DWORD getAppData(void) const
		{ return (m_dwAppData); }

	/**
	 * Allows you to store 32-bit application-specific
	 * data that will be passed to event handlers on each call
	 *
	 * @param dwNewAppData - Application-specific data
	 */
	DWORD setAppData(DWORD dwNewAppData)
	{
		DWORD	dwOldAppData = m_dwAppData;
		m_dwAppData = dwNewAppData;
		return (dwOldAppData);
	}

	/**
	 * Returns a pointer to an event handler registered with
	 * a CLiteHTMLReader by a previous call to setEventHandler().
	 */
	ILiteHTMLReaderEvents* getEventHandler(void) const
		{ return (m_pEventHandler); }

	/**
	 * Registers an event handler with a CLiteHTMLReader. If no
	 * event handler is registered with the reader, all events
	 * raised by the reader will be ignored. An application can
	 * change the event handler even when the parsing process
	 * is in progress.
	 *
	 * @param pNewHandler - pointer to an event handler.
	 *        This parameter can be NULL also.
	 */
	ILiteHTMLReaderEvents* setEventHandler(ILiteHTMLReaderEvents* pNewHandler)
	{
		ILiteHTMLReaderEvents *pOldHandler = m_pEventHandler;
		m_pEventHandler = pNewHandler;
		return (pOldHandler);
	}

	// returns the current value for the specified option
	bool getBoolOption(ReaderOptionsEnum option, bool& bCurVal) const;
	// sets a new value for the specified option
	bool setBoolOption(ReaderOptionsEnum option, bool bNewVal);

// Operations
public:
	// parses an HTML document from the specified string
	UINT Read(LPCTSTR lpszString);
	// parses an HTML document from a file given its HANDLE
	UINT ReadFile(HANDLE hFile);
	UINT ReadFile2 (HANDLE hFile, UINT aCodePage = CP_ACP);
	TBOMType DetectCodePage (HANDLE hFile, int &nBomSize);
	//static BOOL ConvertMultibyteToUnicode (const UINT aCodePage, const LPCSTR aInput, const int aInputSize, CStringW &aOut) throw();
	static BOOL ConvertKoi8RToUnicode (const LPCSTR aInput, const int aInputSize, CStringW &aOut) throw();
	static void DeleteTags (CString &sHtml, CString sTagName);
	static void DeleteComments (CString &sHtml);

// Helpers
protected:
	/** Parsing Helpers */

	// parses an HTML document, and returns the
	// number of characters successfully parsed
	virtual UINT parseDocument(void);

	// parses an HTML comment from the buffer starting from
	// the current buffer position and returns true on sucess
	virtual bool parseComment(CString &rComment);

	// parses an HTML tag from the buffer starting from
	// the current buffer position and returns true on success
	virtual bool parseTag(CLiteHTMLTag &rTag, bool &bIsOpeningTag, bool &bIsClosingTag, TTagParsingMode &aTagParsingMode);
	virtual void NormalizeCharacters(CString &rCharacters)
	{
		rCharacters;
		//rCharacters.Replace(_T("\r\n"), _T(""));
		//rCharacters.Remove(_T('\n'));
		//rCharacters.Replace(_T('\r'), _T(' '));
		//rCharacters.Replace(_T('\t'), _T(' '));
	}

	/** Buffer Manipulation Helpers */

	/**
	 * Resets the buffer position back to the beginning
	 */
	void ResetSeekPointer(void)
		{ m_dwBufPos = 0L; }

	/**
	 * Reads the next character and advances the buffer position
	 */
	TCHAR ReadChar(void)
	{
		ATLASSERT(m_lpszBuffer != NULL);
		if (m_dwBufPos >= m_dwBufLen)
			return (NULL);
		return (m_lpszBuffer[m_dwBufPos++]);
	}

	/**
	 * Moves the buffer back by one TCHAR
	 */
	TCHAR UngetChar(void)
	{
		ATLASSERT(m_lpszBuffer != NULL);
		ATLASSERT(m_dwBufPos);
		return (m_lpszBuffer[--m_dwBufPos]);
	}

	/** Other Helpers */

	/**
	 * Determines if the specified event's notification is to be raised
	 */
	bool getEventNotify(DWORD dwEvent) const
	{
		ATLASSERT(dwEvent == notifyStartStop  ||
			   dwEvent == notifyTagStart   ||
			   dwEvent == notifyTagEnd     ||
			   dwEvent == notifyCharacters ||
			   dwEvent == notifyComment);
		if (m_pEventHandler == NULL)
			return (false);
		return ((m_eventMask & dwEvent) == dwEvent);
	}

	/**
	 * Determines if the character specified by ch is
	 * a white-space character. White-space characters
	 * are defined as ASCII 0x9-0xD,0x20
	 */
	bool isWhiteSpace(TCHAR ch) const
		{ return (::_istspace(ch) ? true : false); }

protected:
	/**
	 * Determines if character entities are to be resolved
	 * Default is true.
	 */
	bool	m_bResolveEntities;

	/**
	 * 32-bit app-specific data (to be passed to callbacks)
	 * Default is 0.
	 */
	DWORD	m_dwAppData;

	/**
	 * Position of the seek pointer
	 */
	DWORD	m_dwBufPos;

	/**
	 * size, in TCHARs, of the buffer
	 */
	DWORD	m_dwBufLen;

	/**
	 * Bit-mask flags to customize events notification(s)
	 * Default is the ORed result of all EventMaskEnum flags.
	 */
	EventMaskEnum	m_eventMask;

	/**
	 * Pointer to an ILiteHTMLReaderEvents based event handling object
	 * Default is NULL
	 */
	ILiteHTMLReaderEvents*	m_pEventHandler;

	/**
	 * Pointer to an array of characters being parsed
	 */
	LPCTSTR	m_lpszBuffer;
};

/**
 * Queries a boolean reader option.
 *
 * @param option - option to inquire
 *                 (one of the ReaderOptionsEnum constants)
 * @param bCurVal - receives the option's current value; left
 *                  untouched when the option is not recognized
 *
 * @return true if the option is known and bCurVal was filled in;
 *         otherwise false.
 */
inline bool CLiteHTMLReader::getBoolOption(ReaderOptionsEnum option, bool& bCurVal) const
{
	// resolveEntities is the only boolean option handled here
	if (option != resolveEntities)
		return (false);

	bCurVal = m_bResolveEntities;
	return (true);
}

/**
 * Assigns a new value to a boolean reader option.
 *
 * @param option - option to change
 *                 (one of the ReaderOptionsEnum constants)
 * @param bNewVal - value to set
 *
 * @return true if the option is known and was updated;
 *         false if the option is not recognized (nothing changes).
 */
inline bool CLiteHTMLReader::setBoolOption(ReaderOptionsEnum option, bool bNewVal)
{
	// resolveEntities is the only boolean option handled here
	if (option != resolveEntities)
		return (false);

	m_bResolveEntities = bNewVal;
	return (true);
}

/**
 * Parses an HTML comment starting from the current buffer position.
 *
 * A comment starts with '<!--' and ends at the first '--' that is
 * followed, after optional white-space, by '>'. A '--' that is not
 * followed by '>' is treated as part of the comment text, so input
 * such as '<!--- note --->' or '<!-- a -- b -->' is accepted.
 * If the text after '<!--' contains no '--' at all, everything up to
 * the terminating null character is taken as the comment.
 *
 * On success the buffer position is advanced past the comment; on
 * failure neither the position nor rComment is modified.
 *
 * @param rComment - this will receive the comment (without delimiters)
 *
 * @return true if successful, false otherwise
 */
inline bool CLiteHTMLReader::parseComment(CString &rComment)
{
	ATLASSERT(m_lpszBuffer != NULL);
	ATLASSERT(m_dwBufPos + 4 < m_dwBufLen);

	LPCTSTR const	lpszStart = &m_lpszBuffer[m_dwBufPos];

	// HTML comments begin with '<!' delimiter and
	// are immediately followed by two hyphens '--'
	if (::_tcsncmp(lpszStart, _T("<!--"), 4) != 0)
		return (false);

	LPCTSTR const	lpszBegin = lpszStart + 4;
	LPCTSTR const	lpszBufEnd = m_lpszBuffer + m_dwBufLen;

	// HTML comments end with two hyphen symbols '--'
	LPCTSTR	lpszDashes = ::_tcsstr(lpszBegin, _T("--"));

	// comment ending delimiter could not be found?
	if (lpszDashes == NULL)
	{
		// consider everything after current buffer position a comment
		rComment = lpszBegin;
		m_dwBufPos += (4 + rComment.GetLength());
		return (true);
	}

	// Try every '--' in turn: the first one followed by '>' closes the
	// comment, the others belong to the comment text.
	for (; lpszDashes != NULL; lpszDashes = ::_tcsstr(lpszDashes + 1, _T("--")))
	{
		// end of buffer? then no later '--' can be terminated either
		if (lpszDashes + 2 >= lpszBufEnd)
			return (false);

		// skip white-space characters after comment ending delimiter '--'
		LPCTSTR	lpszAfter = lpszDashes + 2;
		while (lpszAfter < lpszBufEnd && ::_istspace(*lpszAfter))
			lpszAfter = ::_tcsinc(lpszAfter);

		if (lpszAfter < lpszBufEnd && *lpszAfter == _T('>'))
		{
			lpszAfter = ::_tcsinc(lpszAfter);	// consume the '>'
			rComment = CString(lpszBegin, int (lpszDashes - lpszBegin));
			m_dwBufPos += (DWORD) (lpszAfter - lpszStart);
			return (true);
		}
	}

	// comment has not been terminated properly
	return (false);
}

/**
 * Parses an HTML tag starting from the current buffer position.
 *
 * The work is delegated to CLiteHTMLTag::parseFromStr(), which is given
 * the text at the current position and the number of characters left in
 * the buffer. Its return value is used as the number of characters
 * consumed: zero means failure, anything else advances the buffer
 * position by that amount.
 *
 * @param rTag - this will receive tag information (along with its attributes)
 * @param bIsOpeningTag - receives true if the tag parsed is an opening tag.
 * @param bIsClosingTag - receives true if the tag parsed is a closing tag.
 * @param aTagParsingMode - parsing mode, passed through to parseFromStr()
 *
 * @return true if successful, false otherwise (the buffer position is
 *         then left unchanged)
 */
inline bool CLiteHTMLReader::parseTag(CLiteHTMLTag &rTag,
									  bool &bIsOpeningTag,
									  bool &bIsClosingTag,
									  TTagParsingMode &aTagParsingMode)
{
	ATLASSERT(m_lpszBuffer != NULL);
	// The position must lie inside the buffer so that the unsigned
	// subtraction below cannot wrap around. No minimum length is required:
	// a short tag such as '<p>' may be the last thing in the buffer.
	ATLASSERT(m_dwBufPos < m_dwBufLen);

	const UINT nParsed = rTag.parseFromStr(&m_lpszBuffer[m_dwBufPos],
					bIsOpeningTag, bIsClosingTag, aTagParsingMode, m_dwBufLen - m_dwBufPos);
	if (nParsed == 0)
		return (false);

	m_dwBufPos += nParsed;
	return (true);
}

#pragma warning(pop)