// Mirror of https://github.com/ONLYOFFICE/core.git (synced 2026-04-07 13:55:33 +08:00)
// 475 lines, 12 KiB, C++
#pragma once
|
|
#include "LiteHTMLTag.h"
|
|
#include "UnicodeTextFile.h"
|
|
|
|
using namespace NUnicodeTextFile;
|
|
|
|
|
|
#pragma warning(push, 4)
|
|
|
|
class CLiteHTMLReader;
|
|
|
|
/**
|
|
* ILiteHTMLReaderEvents
|
|
* interface for events
|
|
*/
|
|
class ILiteHTMLReaderEvents
|
|
{
|
|
friend class CLiteHTMLReader;
|
|
|
|
// Events
|
|
protected:
|
|
virtual void BeginParse(DWORD dwAppData, bool &bAbort)
|
|
{
|
|
dwAppData;
|
|
bAbort = false;
|
|
}
|
|
|
|
virtual void StartTag(CLiteHTMLTag *pTag, DWORD dwAppData, bool &bAbort)
|
|
{
|
|
pTag;
|
|
dwAppData;
|
|
bAbort = false;
|
|
}
|
|
|
|
virtual void EndTag(CLiteHTMLTag *pTag, DWORD dwAppData, bool &bAbort)
|
|
{
|
|
pTag;
|
|
dwAppData;
|
|
bAbort = false;
|
|
}
|
|
|
|
virtual void Characters(const CString &rText, DWORD dwAppData, bool &bAbort)
|
|
{
|
|
rText;
|
|
dwAppData;
|
|
bAbort = false;
|
|
}
|
|
|
|
virtual void Comment(const CString &rComment, DWORD dwAppData, bool &bAbort)
|
|
{
|
|
rComment;
|
|
dwAppData;
|
|
bAbort = false;
|
|
}
|
|
|
|
virtual void EndParse(DWORD dwAppData, bool bIsAborted)
|
|
{
|
|
dwAppData;
|
|
bIsAborted;
|
|
}
|
|
|
|
public:
|
|
virtual ~ILiteHTMLReaderEvents() = 0
|
|
{
|
|
}
|
|
};
|
|
|
|
|
|
class CLiteHTMLReader
|
|
{
|
|
public:
|
|
enum EventMaskEnum {
|
|
/** @since 1.0 */
|
|
notifyStartStop = 0x00000001L, // raise BeginParse and EndParse?
|
|
|
|
/** @since 1.0 */
|
|
notifyTagStart = 0x00000002L, // raise StartTag?
|
|
|
|
/** @since 1.0 */
|
|
notifyTagEnd = 0x00000004L, // raise EndTag?
|
|
|
|
/** @since 1.0 */
|
|
notifyCharacters = 0x00000008L, // raise Characters?
|
|
|
|
/** @since 1.0 */
|
|
notifyComment = 0x00000010L, // raise Comment?
|
|
};
|
|
|
|
enum ReaderOptionsEnum {
|
|
/** @since 1.0 */
|
|
resolveEntities, // determines whether entity references should be resolved
|
|
|
|
// TODO:
|
|
// TODO: add more reader options
|
|
// TODO:
|
|
};
|
|
|
|
// Construction/Destruction
|
|
public:
|
|
CLiteHTMLReader()
|
|
{
|
|
m_bResolveEntities = true; // entities are resolved, by default
|
|
m_dwAppData = 0L; // reasonable default!
|
|
m_dwBufPos = 0L; // start from the very beginning
|
|
m_dwBufLen = 0L; // buffer length is unknown yet
|
|
|
|
// default is to raise all of the events
|
|
m_eventMask = (EventMaskEnum)(notifyStartStop |
|
|
notifyTagStart |
|
|
notifyTagEnd |
|
|
notifyCharacters |
|
|
notifyComment );
|
|
|
|
m_pEventHandler = NULL; // no event handler is associated
|
|
m_lpszBuffer = NULL;
|
|
}
|
|
|
|
public:
|
|
/**
|
|
* Returns an event mask which signifies the notification
|
|
* messages a CLiteHTMLReader will send while parsing HTML
|
|
* text.
|
|
*/
|
|
EventMaskEnum getEventMask(void) const
|
|
{ return (m_eventMask); }
|
|
|
|
/**
|
|
* Sets a new event mask.
|
|
*
|
|
* @param dwNewEventMask - new event mask
|
|
*/
|
|
EventMaskEnum setEventMask(DWORD dwNewEventMask)
|
|
{
|
|
EventMaskEnum oldMask = m_eventMask;
|
|
m_eventMask = (EventMaskEnum)dwNewEventMask;
|
|
return (oldMask);
|
|
}
|
|
|
|
/**
|
|
* Changes the current event mask by adding and removing
|
|
* flags specified by addFlags and removeFlags, respectively.
|
|
*
|
|
* @param addFlags - flags to add in the current event mask
|
|
* @param removeFlags - flags to remove from the current event mask
|
|
*/
|
|
EventMaskEnum setEventMask(DWORD addFlags, DWORD removeFlags)
|
|
{
|
|
DWORD dwOldMask = (DWORD)m_eventMask;
|
|
DWORD dwNewMask = (dwOldMask | addFlags) & ~removeFlags;
|
|
m_eventMask = (EventMaskEnum)dwNewMask;
|
|
return ((EventMaskEnum)dwOldMask);
|
|
}
|
|
|
|
/**
|
|
* Returns a 32-bit application-specific data
|
|
* previously set by a call to setAppData()
|
|
*/
|
|
DWORD getAppData(void) const
|
|
{ return (m_dwAppData); }
|
|
|
|
/**
|
|
* Allows you to store 32-bit application-specific
|
|
* data that will be passed to event handlers on each call
|
|
*
|
|
* @param dwNewAppData - Application-specific data
|
|
*/
|
|
DWORD setAppData(DWORD dwNewAppData)
|
|
{
|
|
DWORD dwOldAppData = m_dwAppData;
|
|
m_dwAppData = dwNewAppData;
|
|
return (dwOldAppData);
|
|
}
|
|
|
|
/**
|
|
* Returns a pointer to an event handler registered with
|
|
* a CLiteHTMLReader by a previous call to setEventHandler().
|
|
*/
|
|
ILiteHTMLReaderEvents* getEventHandler(void) const
|
|
{ return (m_pEventHandler); }
|
|
|
|
/**
|
|
* Registers an event handler with a CLiteHTMLReader. If no
|
|
* event handler is registered with the reader, all events
|
|
* raised by the reader will be ignored. An application can
|
|
* change the event handler even when the parsing process
|
|
* is in progress.
|
|
*
|
|
* @param pNewHandler - pointer to an event handler.
|
|
* This parameter can be NULL also.
|
|
*/
|
|
ILiteHTMLReaderEvents* setEventHandler(ILiteHTMLReaderEvents* pNewHandler)
|
|
{
|
|
ILiteHTMLReaderEvents *pOldHandler = m_pEventHandler;
|
|
m_pEventHandler = pNewHandler;
|
|
return (pOldHandler);
|
|
}
|
|
|
|
// returns the current value for the specified option
|
|
bool getBoolOption(ReaderOptionsEnum option, bool& bCurVal) const;
|
|
// sets a new value for the specified option
|
|
bool setBoolOption(ReaderOptionsEnum option, bool bNewVal);
|
|
|
|
// Operations
|
|
public:
|
|
// parses an HTML document from the specified string
|
|
UINT Read(LPCTSTR lpszString);
|
|
// parses an HTML document from a file given its HANDLE
|
|
UINT ReadFile(HANDLE hFile);
|
|
UINT ReadFile2 (HANDLE hFile, UINT aCodePage = CP_ACP);
|
|
TBOMType DetectCodePage (HANDLE hFile, int &nBomSize);
|
|
//static BOOL ConvertMultibyteToUnicode (const UINT aCodePage, const LPCSTR aInput, const int aInputSize, CStringW &aOut) throw();
|
|
static BOOL ConvertKoi8RToUnicode (const LPCSTR aInput, const int aInputSize, CStringW &aOut) throw();
|
|
static void DeleteTags (CString &sHtml, CString sTagName);
|
|
static void DeleteComments (CString &sHtml);
|
|
|
|
// Helpers
|
|
protected:
|
|
/** Parsing Helpers */
|
|
|
|
// parses an HTML document, and returns the
|
|
// number of characters successfully parsed
|
|
virtual UINT parseDocument(void);
|
|
|
|
// parses an HTML comment from the buffer starting from
|
|
// the current buffer position and returns true on sucess
|
|
virtual bool parseComment(CString &rComment);
|
|
|
|
// parses an HTML tag from the buffer starting from
|
|
// the current buffer position and returns true on success
|
|
virtual bool parseTag(CLiteHTMLTag &rTag, bool &bIsOpeningTag, bool &bIsClosingTag, TTagParsingMode &aTagParsingMode);
|
|
virtual void NormalizeCharacters(CString &rCharacters)
|
|
{
|
|
rCharacters;
|
|
//rCharacters.Replace(_T("\r\n"), _T(""));
|
|
//rCharacters.Remove(_T('\n'));
|
|
//rCharacters.Replace(_T('\r'), _T(' '));
|
|
//rCharacters.Replace(_T('\t'), _T(' '));
|
|
}
|
|
|
|
/** Buffer Manipulation Helpers */
|
|
|
|
/**
|
|
* Resets the buffer position back to the beginning
|
|
*/
|
|
void ResetSeekPointer(void)
|
|
{ m_dwBufPos = 0L; }
|
|
|
|
/**
|
|
* Reads the next character and advances the buffer position
|
|
*/
|
|
TCHAR ReadChar(void)
|
|
{
|
|
ATLASSERT(m_lpszBuffer != NULL);
|
|
if (m_dwBufPos >= m_dwBufLen)
|
|
return (NULL);
|
|
return (m_lpszBuffer[m_dwBufPos++]);
|
|
}
|
|
|
|
/**
|
|
* Moves the buffer back by one TCHAR
|
|
*/
|
|
TCHAR UngetChar(void)
|
|
{
|
|
ATLASSERT(m_lpszBuffer != NULL);
|
|
ATLASSERT(m_dwBufPos);
|
|
return (m_lpszBuffer[--m_dwBufPos]);
|
|
}
|
|
|
|
/** Other Helpers */
|
|
|
|
/**
|
|
* Determines if the specified event's notification is to be raised
|
|
*/
|
|
bool getEventNotify(DWORD dwEvent) const
|
|
{
|
|
ATLASSERT(dwEvent == notifyStartStop ||
|
|
dwEvent == notifyTagStart ||
|
|
dwEvent == notifyTagEnd ||
|
|
dwEvent == notifyCharacters ||
|
|
dwEvent == notifyComment);
|
|
if (m_pEventHandler == NULL)
|
|
return (false);
|
|
return ((m_eventMask & dwEvent) == dwEvent);
|
|
}
|
|
|
|
/**
|
|
* Determines if the character specified by ch is
|
|
* a white-space character. White-space characters
|
|
* are defined as ASCII 0x9-0xD,0x20
|
|
*/
|
|
bool isWhiteSpace(TCHAR ch) const
|
|
{ return (::_istspace(ch) ? true : false); }
|
|
|
|
protected:
|
|
/**
|
|
* Determines if character entities are to be resolved
|
|
* Default is true.
|
|
*/
|
|
bool m_bResolveEntities;
|
|
|
|
/**
|
|
* 32-bit app-specific data (to be passed to callbacks)
|
|
* Default is 0.
|
|
*/
|
|
DWORD m_dwAppData;
|
|
|
|
/**
|
|
* Position of the seek pointer
|
|
*/
|
|
DWORD m_dwBufPos;
|
|
|
|
/**
|
|
* size, in TCHARs, of the buffer
|
|
*/
|
|
DWORD m_dwBufLen;
|
|
|
|
/**
|
|
* Bit-mask flags to customize events notification(s)
|
|
* Default is the ORed result of all EventMaskEnum flags.
|
|
*/
|
|
EventMaskEnum m_eventMask;
|
|
|
|
/**
|
|
* Pointer to an ILiteHTMLReaderEvents based event handling object
|
|
* Default is NULL
|
|
*/
|
|
ILiteHTMLReaderEvents* m_pEventHandler;
|
|
|
|
/**
|
|
* Pointer to an array of characters being parsed
|
|
*/
|
|
LPCTSTR m_lpszBuffer;
|
|
};
|
|
|
|
/**
|
|
* Returns the current value for the specified option.
|
|
*
|
|
* @param option - option to inquire
|
|
* @param bCurVal - this will receive the current value for the option.
|
|
*/
|
|
inline bool CLiteHTMLReader::getBoolOption(ReaderOptionsEnum option, bool& bCurVal) const
|
|
{
|
|
bool bSuccess = false;
|
|
|
|
switch (option)
|
|
{
|
|
case resolveEntities:
|
|
{
|
|
bCurVal = m_bResolveEntities;
|
|
bSuccess = true;
|
|
break;
|
|
}
|
|
default:
|
|
{
|
|
bSuccess = false;
|
|
break;
|
|
}
|
|
}
|
|
return (bSuccess);
|
|
}
|
|
|
|
/**
|
|
* Changes the value of an option and returns
|
|
* true/false indicating if the specified option
|
|
* was set successfully.
|
|
*
|
|
* @param option - option to change
|
|
* (one of the ReaderOptionsEnum constants)
|
|
* @param bNewVal - value to set
|
|
*
|
|
* @return true if option was set successfully; otherwise false.
|
|
*/
|
|
inline bool CLiteHTMLReader::setBoolOption(ReaderOptionsEnum option, bool bNewVal)
|
|
{
|
|
bool bSuccess = false;
|
|
|
|
switch (option)
|
|
{
|
|
case resolveEntities:
|
|
{
|
|
m_bResolveEntities = bNewVal;
|
|
bSuccess = true;
|
|
break;
|
|
}
|
|
default:
|
|
{
|
|
bSuccess = false;
|
|
break;
|
|
}
|
|
}
|
|
return (bSuccess);
|
|
}
|
|
|
|
/**
|
|
* Parses an HTML comment starting from the current buffer position.
|
|
*
|
|
* @param rComment - this will receive the comment (without delimeters)
|
|
*
|
|
* @return true if successful, false otherwise
|
|
*/
|
|
inline bool CLiteHTMLReader::parseComment(CString &rComment)
|
|
{
|
|
ATLASSERT(m_lpszBuffer != NULL);
|
|
ATLASSERT(m_dwBufPos >= 0L);
|
|
ATLASSERT(m_dwBufPos + 4 < m_dwBufLen);
|
|
|
|
// HTML comments begin with '<!' delimeter and
|
|
// are immediately followed by two hyphens '--'
|
|
if (::_tcsncmp(&m_lpszBuffer[m_dwBufPos], _T("<!--"), 4))
|
|
return (false);
|
|
|
|
LPCTSTR lpszBegin = &m_lpszBuffer[m_dwBufPos + 4];
|
|
// HTML comments end with two hyphen symbols '--'
|
|
LPCTSTR lpszEnd = ::_tcsstr(lpszBegin, _T("--"));
|
|
|
|
// comment ending delimeter could not be found?
|
|
if (lpszEnd == NULL)
|
|
// consider everything after current buffer position a comment
|
|
{
|
|
rComment = lpszBegin;
|
|
m_dwBufPos += (4 + rComment.GetLength());
|
|
return (true);
|
|
}
|
|
|
|
CString strComment(lpszBegin, int (lpszEnd - lpszBegin));
|
|
|
|
// end of buffer?
|
|
if (lpszEnd + 2 >= m_lpszBuffer + m_dwBufLen)
|
|
return (false);
|
|
|
|
// skip white-space characters after comment ending delimeter '--'
|
|
lpszEnd += 2;
|
|
while (::_istspace(*lpszEnd))
|
|
lpszEnd = ::_tcsinc(lpszEnd);
|
|
|
|
// comment has not been terminated properly
|
|
if (*lpszEnd != _T('>'))
|
|
return (false);
|
|
|
|
lpszEnd = ::_tcsinc(lpszEnd);
|
|
m_dwBufPos += (DWORD) (lpszEnd - &m_lpszBuffer[m_dwBufPos]);
|
|
rComment = strComment;
|
|
return (true);
|
|
}
|
|
|
|
/**
|
|
* Parses an HTML tag starting from the current buffer position.
|
|
*
|
|
* @param rTag - this will receive tag information (along with its attributes)
|
|
* @param bIsOpeningTag - receives true if the tag parsed is a opening tag.
|
|
* @param bIsClosingTag - receives true if the tag parsed is a closing tag.
|
|
*
|
|
* @return true if successful, false otherwise
|
|
*/
|
|
inline bool CLiteHTMLReader::parseTag(CLiteHTMLTag &rTag,
|
|
bool &bIsOpeningTag,
|
|
bool &bIsClosingTag,
|
|
TTagParsingMode &aTagParsingMode)
|
|
{
|
|
ATLASSERT(m_lpszBuffer != NULL);
|
|
ATLASSERT(m_dwBufPos >= 0L);
|
|
ATLASSERT(m_dwBufPos + 4 < m_dwBufLen);
|
|
|
|
UINT nRetVal = rTag.parseFromStr(&m_lpszBuffer[m_dwBufPos],
|
|
bIsOpeningTag, bIsClosingTag, aTagParsingMode, m_dwBufLen - m_dwBufPos);
|
|
if (!nRetVal)
|
|
return (false);
|
|
|
|
m_dwBufPos += nRetVal;
|
|
return (true);
|
|
}
|
|
|
|
#pragma warning(pop)
|
|
|