mirror of
https://github.com/ONLYOFFICE/core.git
synced 2026-02-10 18:05:41 +08:00
1500 lines
41 KiB
C++
1500 lines
41 KiB
C++
#include "HTMLReader.h"
|
||
|
||
#include "../Common/Network/FileTransporter/include/FileTransporter.h"
|
||
#include "../DesktopEditor/common/File.h"
|
||
#include "../DesktopEditor/common/Path.h"
|
||
|
||
#include "../Common/3dParty/html/htmltoxhtml.h"
|
||
|
||
#include "Common.h"
|
||
|
||
#include "Writers/OOXMLWriter.h"
|
||
#include "Tags/OOXMLTags.h"
|
||
|
||
#include "Writers/MDWriter.h"
|
||
#include "Tags/MDTags.h"
|
||
|
||
#include "../Common/3dParty/html/gumbo-parser/src/gumbo.h"
|
||
#include "src/StringFinder.h"
|
||
|
||
#include <boost/tuple/tuple.hpp>
|
||
|
||
namespace HTML
|
||
{
|
||
// HTML_TAG(X) expands to the gumbo enumerator GUMBO_TAG_X.
#define HTML_TAG(tag) GUMBO_TAG_##tag
// ADD_TAG(name, X) expands to one {name, GUMBO_TAG_X} initializer entry.
#define ADD_TAG(strName, enumName) {strName, HTML_TAG(enumName)}
// Enumerator suffix used for tag names that get no enumerator of their own and are skipped anyway.
#define SKIP_TAG SCRIPT
// Enumerator for tags that are not recognised.
#define UNKNOWN_TAG GumboTag::GUMBO_TAG_UNKNOWN
// Local alias for the gumbo tag enumeration type.
#define HtmlTag GumboTag
|
||
|
||
const static std::map<std::wstring, HtmlTag> m_HTML_TAGS
|
||
{
|
||
ADD_TAG(L"a", A),
|
||
ADD_TAG(L"abbr", ABBR),
|
||
ADD_TAG(L"acronym", ACRONYM),
|
||
ADD_TAG(L"address", ADDRESS),
|
||
ADD_TAG(L"applet", APPLET),
|
||
ADD_TAG(L"area", AREA),
|
||
ADD_TAG(L"article", ARTICLE),
|
||
ADD_TAG(L"aside", ASIDE),
|
||
ADD_TAG(L"audio", AUDIO),
|
||
ADD_TAG(L"b", B),
|
||
ADD_TAG(L"base", BASE),
|
||
ADD_TAG(L"basefont", BASEFONT),
|
||
ADD_TAG(L"bdi", BDI),
|
||
ADD_TAG(L"bdo", BDO),
|
||
ADD_TAG(L"bgsound", BGSOUND),
|
||
ADD_TAG(L"blockquote", BLOCKQUOTE),
|
||
ADD_TAG(L"big", BIG),
|
||
ADD_TAG(L"body", BODY),
|
||
ADD_TAG(L"blink", BLINK),
|
||
ADD_TAG(L"br", BR),
|
||
ADD_TAG(L"button", BUTTON),
|
||
ADD_TAG(L"canvas", CANVAS),
|
||
ADD_TAG(L"caption", CAPTION),
|
||
ADD_TAG(L"center", CENTER),
|
||
ADD_TAG(L"cite", CITE),
|
||
ADD_TAG(L"code", CODE),
|
||
ADD_TAG(L"col", COL),
|
||
ADD_TAG(L"colgroup", COLGROUP),
|
||
ADD_TAG(L"command", SKIP_TAG), // Данного обозначения нет, но т.к.мы всё равно пропускаем, то делаем script
|
||
ADD_TAG(L"comment", SKIP_TAG), // Данного обозначения нет, но т.к.мы всё равно пропускаем, то делаем script
|
||
ADD_TAG(L"datalist", DATALIST),
|
||
ADD_TAG(L"dd", DD),
|
||
ADD_TAG(L"del", DEL),
|
||
ADD_TAG(L"details", DETAILS),
|
||
ADD_TAG(L"dfn", DFN),
|
||
ADD_TAG(L"dir", DIR),
|
||
ADD_TAG(L"div", DIV),
|
||
ADD_TAG(L"dl", DL),
|
||
ADD_TAG(L"dt", DT),
|
||
ADD_TAG(L"em", EM),
|
||
ADD_TAG(L"embed", EMBED),
|
||
ADD_TAG(L"fieldset", FIELDSET),
|
||
ADD_TAG(L"figcaption", FIGCAPTION),
|
||
ADD_TAG(L"figure", FIGURE),
|
||
ADD_TAG(L"font", FONT),
|
||
ADD_TAG(L"form", FORM),
|
||
ADD_TAG(L"footer", FOOTER),
|
||
ADD_TAG(L"frame", FRAME),
|
||
ADD_TAG(L"frameset", FRAMESET),
|
||
ADD_TAG(L"h1", H1),
|
||
ADD_TAG(L"h2", H2),
|
||
ADD_TAG(L"h3", H3),
|
||
ADD_TAG(L"h4", H4),
|
||
ADD_TAG(L"h5", H5),
|
||
ADD_TAG(L"h6", H6),
|
||
ADD_TAG(L"head", HEAD),
|
||
ADD_TAG(L"header", HEADER),
|
||
ADD_TAG(L"hgroup", HGROUP),
|
||
ADD_TAG(L"hr", HR),
|
||
ADD_TAG(L"html", HTML),
|
||
ADD_TAG(L"i", I),
|
||
ADD_TAG(L"iframe", IFRAME),
|
||
ADD_TAG(L"img", IMG),
|
||
ADD_TAG(L"input", INPUT),
|
||
ADD_TAG(L"ins", INS),
|
||
ADD_TAG(L"isindex", ISINDEX),
|
||
ADD_TAG(L"kbd", KBD),
|
||
ADD_TAG(L"keygen", KEYGEN),
|
||
ADD_TAG(L"label", LABEL),
|
||
ADD_TAG(L"legend", LEGEND),
|
||
ADD_TAG(L"li", LI),
|
||
ADD_TAG(L"link", LINK),
|
||
ADD_TAG(L"main", MAIN),
|
||
ADD_TAG(L"map", MAP),
|
||
ADD_TAG(L"marquee", MARQUEE),
|
||
ADD_TAG(L"mark", MARK),
|
||
ADD_TAG(L"menu", MENU),
|
||
ADD_TAG(L"meta", META),
|
||
ADD_TAG(L"meter", METER),
|
||
ADD_TAG(L"nav", NAV),
|
||
ADD_TAG(L"nobr", NOBR),
|
||
ADD_TAG(L"noembed", NOEMBED),
|
||
ADD_TAG(L"noframes", NOFRAMES),
|
||
ADD_TAG(L"noscript", NOSCRIPT),
|
||
ADD_TAG(L"object", OBJECT),
|
||
ADD_TAG(L"ol", OL),
|
||
ADD_TAG(L"optgroup", OPTGROUP),
|
||
ADD_TAG(L"option", OPTION),
|
||
ADD_TAG(L"output", OUTPUT),
|
||
ADD_TAG(L"p", P),
|
||
ADD_TAG(L"param", PARAM),
|
||
ADD_TAG(L"plaintext", PLAINTEXT),
|
||
ADD_TAG(L"pre", PRE),
|
||
ADD_TAG(L"progress", PROGRESS),
|
||
ADD_TAG(L"q", Q),
|
||
ADD_TAG(L"rp", RP),
|
||
ADD_TAG(L"rt", RT),
|
||
ADD_TAG(L"ruby", RUBY),
|
||
ADD_TAG(L"s", S),
|
||
ADD_TAG(L"samp", SAMP),
|
||
ADD_TAG(L"script", SCRIPT),
|
||
ADD_TAG(L"section", SECTION),
|
||
ADD_TAG(L"select", SELECT),
|
||
ADD_TAG(L"small", SMALL),
|
||
ADD_TAG(L"span", SPAN),
|
||
ADD_TAG(L"source", SOURCE),
|
||
ADD_TAG(L"strike", STRIKE),
|
||
ADD_TAG(L"strong", STRONG),
|
||
ADD_TAG(L"style", STYLE),
|
||
ADD_TAG(L"sub", SUB),
|
||
ADD_TAG(L"summary", SUMMARY),
|
||
ADD_TAG(L"sup", SUP),
|
||
ADD_TAG(L"table", TABLE),
|
||
ADD_TAG(L"tbody", TBODY),
|
||
ADD_TAG(L"td", TD),
|
||
ADD_TAG(L"textarea", TEXTAREA),
|
||
ADD_TAG(L"tfoot", TFOOT),
|
||
ADD_TAG(L"th", TH),
|
||
ADD_TAG(L"thead", THEAD),
|
||
ADD_TAG(L"time", TIME),
|
||
ADD_TAG(L"title", TITLE),
|
||
ADD_TAG(L"tr", TR),
|
||
ADD_TAG(L"tt", TT),
|
||
ADD_TAG(L"u", U),
|
||
ADD_TAG(L"ul", UL),
|
||
ADD_TAG(L"var", VAR),
|
||
ADD_TAG(L"video", VIDEO),
|
||
ADD_TAG(L"wbr", WBR),
|
||
ADD_TAG(L"xmp", XMP),
|
||
|
||
ADD_TAG(L"svg", SVG)
|
||
};
|
||
|
||
bool HTML2XHTML(const std::wstring& wsFileName, XmlUtils::CXmlLiteReader& oLiteReader)
|
||
{
|
||
BYTE* pData;
|
||
DWORD nLength;
|
||
if (!NSFile::CFileBinary::ReadAllBytes(wsFileName, &pData, nLength))
|
||
return false;
|
||
|
||
std::string sFileContent = XmlUtils::GetUtf8FromFileContent(pData, nLength);
|
||
|
||
bool bNeedConvert = true;
|
||
if (nLength > 4)
|
||
{
|
||
if (pData[0] == 0xFF && pData[1] == 0xFE && !(pData[2] == 0x00 && pData[3] == 0x00))
|
||
bNeedConvert = false;
|
||
if (pData[0] == 0xFE && pData[1] == 0xFF)
|
||
bNeedConvert = false;
|
||
|
||
if (pData[0] == 0xFF && pData[1] == 0xFE && pData[2] == 0x00 && pData[3] == 0x00)
|
||
bNeedConvert = false;
|
||
if (pData[0] == 0 && pData[1] == 0 && pData[2] == 0xFE && pData[3] == 0xFF)
|
||
bNeedConvert = false;
|
||
}
|
||
|
||
RELEASEARRAYOBJECTS(pData);
|
||
|
||
size_t nFind = sFileContent.find("version=\"");
|
||
if(nFind != std::string::npos)
|
||
{
|
||
nFind += 9;
|
||
size_t nFindEnd = sFileContent.find("\"", nFind);
|
||
if(nFindEnd != std::string::npos)
|
||
sFileContent.replace(nFind, nFindEnd - nFind, "1.0");
|
||
}
|
||
|
||
const std::wstring sRes{htmlToXhtml(sFileContent, bNeedConvert)};
|
||
|
||
#ifdef SAVE_NORMALIZED_HTML
|
||
#if 1 == SAVE_NORMALIZED_HTML
|
||
NSFile::CFileBinary oWriter;
|
||
if (oWriter.CreateFileW(L"res.html"))
|
||
{
|
||
oWriter.WriteStringUTF8(sRes);
|
||
oWriter.CloseFile();
|
||
}
|
||
#endif
|
||
#endif
|
||
|
||
return oLiteReader.FromString(sRes);
|
||
}
|
||
|
||
bool MHT2XHTML(const std::wstring& wsFileName, XmlUtils::CXmlLiteReader& oLiteReader)
|
||
{
|
||
NSFile::CFileBinary file;
|
||
if (!file.OpenFile(wsFileName))
|
||
return false;
|
||
|
||
unsigned char* buffer = new unsigned char[4096];
|
||
if (!buffer)
|
||
{
|
||
file.CloseFile();
|
||
return false;
|
||
}
|
||
|
||
DWORD dwReadBytes = 0;
|
||
file.ReadFile(buffer, 4096, dwReadBytes);
|
||
file.CloseFile();
|
||
std::string xml_string = XmlUtils::GetUtf8FromFileContent(buffer, dwReadBytes);
|
||
|
||
const std::string sContentType = NSStringFinder::FindProperty(xml_string, "content-type", ":", ";");
|
||
bool bRes = false;
|
||
|
||
if(NSStringFinder::Equals(sContentType, "multipart/related"))
|
||
{
|
||
BYTE* pData;
|
||
DWORD nLength;
|
||
if (!NSFile::CFileBinary::ReadAllBytes(wsFileName, &pData, nLength))
|
||
return false;
|
||
|
||
std::string sFileContent = XmlUtils::GetUtf8FromFileContent(pData, nLength);
|
||
RELEASEARRAYOBJECTS(pData);
|
||
const std::wstring sRes = mhtToXhtml(sFileContent);
|
||
bRes = oLiteReader.FromString(sRes);
|
||
}
|
||
else
|
||
bRes = HTML2XHTML(wsFileName, oLiteReader);
|
||
|
||
RELEASEARRAYOBJECTS(buffer);
|
||
return bRes;
|
||
}
|
||
|
||
inline std::wstring GetArgumentValue(XmlUtils::CXmlLiteReader& oLiteReader, const std::wstring& wsArgumentName, const std::wstring& wsDefaultValue = L"");
|
||
inline bool CheckArgumentMath(const std::wstring& wsNodeName, const std::wstring& wsStyleName);
|
||
inline HtmlTag GetHtmlTag(const std::wstring& wsStrTag);
|
||
inline bool UnreadableNode(const std::wstring& wsNodeName);
|
||
inline bool TagIsUnprocessed(const std::wstring& wsTagName);
|
||
|
||
CHTMLReader::CHTMLReader()
|
||
: m_pWriter(nullptr)
|
||
{}
|
||
|
||
// Destroys the writer owned by the reader, if there is one.
CHTMLReader::~CHTMLReader()
{
	// deleting a null pointer is a no-op, so no explicit check is required
	delete m_pWriter;
}
|
||
|
||
void CHTMLReader::SetTempDirectory(const std::wstring& wsPath)
|
||
{
|
||
m_wsTempDirectory = wsPath;
|
||
}
|
||
|
||
void CHTMLReader::SetCoreDirectory(const std::wstring& wsPath)
|
||
{
|
||
m_wsCoreDirectory = wsPath;
|
||
}
|
||
|
||
// Converts a single HTML file to OOXML inside wsDirectory.
// Returns the result of InitAndConvert2OOXML() (S_OK when the file was converted, S_FALSE otherwise).
HRESULT CHTMLReader::ConvertHTML2OOXML(const std::wstring& wsPath, const std::wstring& wsDirectory, THTMLParameters* pParameters)
{
	const std::vector<std::wstring> arSinglePath{wsPath};
	return InitAndConvert2OOXML(arSinglePath, wsDirectory, HTML2XHTML, pParameters);
}
|
||
|
||
// Converts a single HTML file to Markdown; wsFinalFile is passed to the writer's End().
// Returns the result of InitAndConvert2Markdown() (S_OK when the file was converted, S_FALSE otherwise).
HRESULT CHTMLReader::ConvertHTML2Markdown(const std::wstring& wsPath, const std::wstring& wsFinalFile, TMarkdownParameters* pParameters)
{
	const std::vector<std::wstring> arSinglePath{wsPath};
	return InitAndConvert2Markdown(arSinglePath, wsFinalFile, HTML2XHTML, pParameters);
}
|
||
|
||
// Converts several HTML files into one OOXML output inside wsDirectory.
// Returns S_OK when at least one file was converted, S_FALSE otherwise.
HRESULT CHTMLReader::ConvertHTML2OOXML(const std::vector<std::wstring>& arPaths, const std::wstring& wsDirectory, THTMLParameters* pParameters)
{
	const HRESULT lResult = InitAndConvert2OOXML(arPaths, wsDirectory, HTML2XHTML, pParameters);
	return lResult;
}
|
||
|
||
// Converts several HTML files into one Markdown output; wsFinalFile is passed to the writer's End().
// Returns S_OK when at least one file was converted, S_FALSE otherwise.
HRESULT CHTMLReader::ConvertHTML2Markdown(const std::vector<std::wstring>& arPaths, const std::wstring& wsFinalFile, TMarkdownParameters* pParameters)
{
	const HRESULT lResult = InitAndConvert2Markdown(arPaths, wsFinalFile, HTML2XHTML, pParameters);
	return lResult;
}
|
||
|
||
// Converts a single MHT file to OOXML inside wsDirectory.
// Returns the result of InitAndConvert2OOXML() (S_OK when the file was converted, S_FALSE otherwise).
HRESULT CHTMLReader::ConvertMHT2OOXML(const std::wstring& wsPath, const std::wstring& wsDirectory, THTMLParameters* pParameters)
{
	const std::vector<std::wstring> arSinglePath{wsPath};
	return InitAndConvert2OOXML(arSinglePath, wsDirectory, MHT2XHTML, pParameters);
}
|
||
|
||
// Converts a single MHT file to Markdown; wsFinalFile is passed to the writer's End().
// Returns the result of InitAndConvert2Markdown() (S_OK when the file was converted, S_FALSE otherwise).
HRESULT CHTMLReader::ConvertMHT2Markdown(const std::wstring& wsPath, const std::wstring& wsFinalFile, TMarkdownParameters* pParameters)
{
	const std::vector<std::wstring> arSinglePath{wsPath};
	return InitAndConvert2Markdown(arSinglePath, wsFinalFile, MHT2XHTML, pParameters);
}
|
||
|
||
// Converts several MHT files into one OOXML output inside wsDirectory.
// Returns S_OK when at least one file was converted, S_FALSE otherwise.
HRESULT CHTMLReader::ConvertMHT2OOXML(const std::vector<std::wstring>& arPaths, const std::wstring& wsDirectory, THTMLParameters* pParameters)
{
	const HRESULT lResult = InitAndConvert2OOXML(arPaths, wsDirectory, MHT2XHTML, pParameters);
	return lResult;
}
|
||
|
||
// Converts several MHT files into one Markdown output; wsFinalFile is passed to the writer's End().
// Returns S_OK when at least one file was converted, S_FALSE otherwise.
HRESULT CHTMLReader::ConvertMHT2Markdown(const std::vector<std::wstring>& arPaths, const std::wstring& wsFinalFile, TMarkdownParameters* pParameters)
{
	const HRESULT lResult = InitAndConvert2Markdown(arPaths, wsFinalFile, MHT2XHTML, pParameters);
	return lResult;
}
|
||
|
||
void CHTMLReader::Clear()
|
||
{
|
||
if (nullptr != m_pWriter)
|
||
delete m_pWriter;
|
||
|
||
m_mTags.clear();
|
||
|
||
m_wsTempDirectory.clear();
|
||
m_wsSrcDirectory .clear();
|
||
m_wsDstDirectory .clear();
|
||
m_wsBaseDirectory.clear();
|
||
m_wsCoreDirectory.clear();
|
||
}
|
||
|
||
// Resets the reader, creates a COOXMLWriter and registers one handler per supported tag.
// Tags that need no OOXML-specific handling share a single CEmptyTag instance.
void CHTMLReader::InitOOXMLTags(THTMLParameters* pParametrs)
{
	Clear();

	COOXMLWriter *pWriter = new COOXMLWriter(pParametrs, &m_oCSSCalculator);

	if (nullptr == pWriter)
		return;

	// NOTE(review): Clear() above has just emptied all five directory members, so the values
	// forwarded here are empty strings at this point - confirm that this is intended.
	pWriter->SetSrcDirectory (m_wsSrcDirectory);
	pWriter->SetDstDirectory (m_wsDstDirectory);
	pWriter->SetTempDirectory(m_wsTempDirectory);
	pWriter->SetBaseDirectory(m_wsBaseDirectory);
	pWriter->SetCoreDirectory(m_wsCoreDirectory);

	m_pWriter = pWriter;

	// Tags with a dedicated OOXML handler
	m_mTags[HTML_TAG(A)]          = std::make_shared<CAnchor<COOXMLWriter>>(pWriter);
	m_mTags[HTML_TAG(ABBR)]       = std::make_shared<CAnchor<COOXMLWriter>>(pWriter);
	m_mTags[HTML_TAG(BR)]         = std::make_shared<CBreak<COOXMLWriter>>(pWriter);
	m_mTags[HTML_TAG(DIV)]        = std::make_shared<CDivision<COOXMLWriter>>(pWriter);
	m_mTags[HTML_TAG(IMG)]        = std::make_shared<CImage<COOXMLWriter>>(pWriter);
	m_mTags[HTML_TAG(FONT)]       = std::make_shared<CFont<COOXMLWriter>>(pWriter);
	m_mTags[HTML_TAG(INPUT)]      = std::make_shared<CInput<COOXMLWriter>>(pWriter);
	m_mTags[HTML_TAG(BASEFONT)]   = std::make_shared<CBaseFont<COOXMLWriter>>(pWriter);
	m_mTags[HTML_TAG(BLOCKQUOTE)] = std::make_shared<CBlockquote<COOXMLWriter>>(pWriter);
	m_mTags[HTML_TAG(HR)]         = std::make_shared<CHorizontalRule<COOXMLWriter>>(pWriter);
	m_mTags[HTML_TAG(OL)]         = std::make_shared<CList<COOXMLWriter>>(pWriter);
	m_mTags[HTML_TAG(LI)]         = std::make_shared<CListElement<COOXMLWriter>>(pWriter);
	m_mTags[HTML_TAG(CAPTION)]    = std::make_shared<CCaption<COOXMLWriter>>(pWriter);
	m_mTags[HTML_TAG(TABLE)]      = std::make_shared<CTable<COOXMLWriter>>(pWriter);
	m_mTags[HTML_TAG(TR)]         = std::make_shared<CTableRow<COOXMLWriter>>(pWriter);
	m_mTags[HTML_TAG(TD)]         = std::make_shared<CTableCell<COOXMLWriter>>(pWriter);

	// Every remaining tag shares one CEmptyTag handler
	const std::shared_ptr<ITag> pSharedEmptyTag{std::make_shared<CEmptyTag>()};

	for (const HtmlTag eTag : {HTML_TAG(B),   HTML_TAG(I),   HTML_TAG(PRE), HTML_TAG(CENTER), HTML_TAG(KBD),
	                           HTML_TAG(S),   HTML_TAG(U),   HTML_TAG(MARK), HTML_TAG(SUP),   HTML_TAG(DD),
	                           HTML_TAG(Q),   HTML_TAG(BDO), HTML_TAG(SPAN), HTML_TAG(H1),    HTML_TAG(CODE)})
		m_mTags[eTag] = pSharedEmptyTag;
}
|
||
|
||
// Creates a CMDWriter (default parameters are used when pParametrs is null) and registers
// one handler per supported tag. Tags that have no Markdown representation share a single
// CEmptyTag instance.
void CHTMLReader::InitMDTags(TMarkdownParameters* pParametrs)
{
	// Release the writer and handlers of a previous conversion first; overwriting m_pWriter
	// without deleting it leaked the old writer whenever the reader was reused.
	// The directory members are deliberately left untouched.
	delete m_pWriter;
	m_pWriter = nullptr;
	m_mTags.clear();

	CMDWriter *pWriter = new CMDWriter((nullptr != pParametrs) ? *pParametrs : TMarkdownParameters{});

	if (nullptr == pWriter)
		return;

	m_pWriter = pWriter;

	// Tags with a dedicated Markdown handler
	m_mTags[HTML_TAG(A)]          = std::make_shared<CAnchor<CMDWriter>>(pWriter);
	m_mTags[HTML_TAG(B)]          = std::make_shared<CBold<CMDWriter>>(pWriter);
	m_mTags[HTML_TAG(BR)]         = std::make_shared<CBreak<CMDWriter>>(pWriter);
	m_mTags[HTML_TAG(I)]          = std::make_shared<CItalic<CMDWriter>>(pWriter);
	m_mTags[HTML_TAG(S)]          = std::make_shared<CStrike<CMDWriter>>(pWriter);
	m_mTags[HTML_TAG(Q)]          = std::make_shared<CQuotation<CMDWriter>>(pWriter);
	m_mTags[HTML_TAG(H1)]         = std::make_shared<CHeader<CMDWriter>>(pWriter);
	m_mTags[HTML_TAG(IMG)]        = std::make_shared<CImage<CMDWriter>>(pWriter);
	m_mTags[HTML_TAG(HR)]         = std::make_shared<CHorizontalRule<CMDWriter>>(pWriter);
	m_mTags[HTML_TAG(BLOCKQUOTE)] = std::make_shared<CBlockquote<CMDWriter>>(pWriter);
	m_mTags[HTML_TAG(TABLE)]      = std::make_shared<CTable<CMDWriter>>(pWriter);
	m_mTags[HTML_TAG(TR)]         = std::make_shared<CTableRow<CMDWriter>>(pWriter);
	m_mTags[HTML_TAG(TD)]         = std::make_shared<CTableCell<CMDWriter>>(pWriter);
	m_mTags[HTML_TAG(OL)]         = std::make_shared<CList<CMDWriter>>(pWriter);
	m_mTags[HTML_TAG(LI)]         = std::make_shared<CListElement<CMDWriter>>(pWriter);
	m_mTags[HTML_TAG(PRE)]        = std::make_shared<CPreformatted<CMDWriter>>(pWriter);

	// <code> and <kbd> share one handler instance
	std::shared_ptr<ITag> oCode{std::make_shared<CCode<CMDWriter>>(pWriter)};

	m_mTags[HTML_TAG(CODE)] = oCode;
	m_mTags[HTML_TAG(KBD)]  = oCode;

	// Every remaining tag shares one CEmptyTag handler
	std::shared_ptr<ITag> oIgnoredTag{std::make_shared<CEmptyTag>()};

	m_mTags[HTML_TAG(ABBR)]     = oIgnoredTag;
	m_mTags[HTML_TAG(DIV)]      = oIgnoredTag;
	m_mTags[HTML_TAG(FONT)]     = oIgnoredTag;
	m_mTags[HTML_TAG(INPUT)]    = oIgnoredTag;
	m_mTags[HTML_TAG(BASEFONT)] = oIgnoredTag;
	m_mTags[HTML_TAG(CENTER)]   = oIgnoredTag;
	m_mTags[HTML_TAG(MARK)]     = oIgnoredTag;
	m_mTags[HTML_TAG(SUP)]      = oIgnoredTag;
	m_mTags[HTML_TAG(DD)]       = oIgnoredTag;
	m_mTags[HTML_TAG(BDO)]      = oIgnoredTag;
	m_mTags[HTML_TAG(SPAN)]     = oIgnoredTag;
	m_mTags[HTML_TAG(CAPTION)]  = oIgnoredTag;
	m_mTags[HTML_TAG(U)]        = oIgnoredTag;
}
|
||
|
||
bool CHTMLReader::IsHTML()
|
||
{
|
||
return ((m_oLightReader.MoveToStart() && m_oLightReader.ReadNextNode()) ? m_oLightReader.GetName() == L"html" : false);
|
||
}
|
||
|
||
// Sets up the OOXML writer and converts every file in arPaths into one document in wsDirectory.
// Convertation loads a file into the XML reader. When pParameters requests it, a page break is
// written after each successfully converted file.
// Returns S_OK when at least one file was converted, S_FALSE otherwise.
HRESULT CHTMLReader::InitAndConvert2OOXML(const std::vector<std::wstring>& arPaths, const std::wstring& wsDirectory, Convert_Func Convertation, THTMLParameters* pParameters)
{
	InitOOXMLTags(pParameters);
	m_wsDstDirectory = wsDirectory;

	HRESULT lStatus{S_FALSE};

	m_pWriter->Begin(wsDirectory);

	for (const std::wstring& wsFile : arPaths)
	{
		// Files that fail to convert are skipped without aborting the batch
		if (!Convert(wsFile, Convertation))
			continue;

		lStatus = S_OK;

		if (nullptr != pParameters && pParameters->m_bNeedPageBreakBefore)
			m_pWriter->PageBreak();
	}

	m_pWriter->End(wsDirectory);

	return lStatus;
}
|
||
|
||
// Sets up the Markdown writer and converts every file in arPaths into a single output;
// wsFinalFile is handed to the writer's End(). Convertation loads a file into the XML reader.
// Returns S_OK when at least one file was converted, S_FALSE otherwise.
HRESULT CHTMLReader::InitAndConvert2Markdown(const std::vector<std::wstring>& arPaths, const std::wstring& wsFinalFile, Convert_Func Convertation, TMarkdownParameters* pParameters)
{
	InitMDTags(pParameters);

	HRESULT lStatus{S_FALSE};

	m_pWriter->Begin(L"");

	for (const std::wstring& wsFile : arPaths)
	{
		// Files that fail to convert are skipped without aborting the batch
		if (!Convert(wsFile, Convertation))
			continue;

		lStatus = S_OK;
	}

	m_pWriter->End(wsFinalFile);

	return lStatus;
}
|
||
|
||
bool CHTMLReader::Convert(const std::wstring& wsPath, Convert_Func Convertation)
|
||
{
|
||
if (nullptr == m_pWriter || !Convertation(wsPath, m_oLightReader) || !m_oLightReader.IsValid() || !IsHTML())
|
||
return false;
|
||
|
||
m_wsSrcDirectory = NSSystemPath::GetDirectoryName(wsPath);
|
||
|
||
m_oLightReader.MoveToStart();
|
||
m_oLightReader.ReadNextNode();
|
||
ReadStyle();
|
||
|
||
// Переходим в начало
|
||
if(!m_oLightReader.MoveToStart())
|
||
return S_FALSE;
|
||
|
||
ReadDocument();
|
||
|
||
return true;
|
||
}
|
||
|
||
void CHTMLReader::ReadStyle()
|
||
{
|
||
if(m_oLightReader.IsEmptyNode())
|
||
return;
|
||
|
||
const int nDeath = m_oLightReader.GetDepth();
|
||
std::wstring sName;
|
||
|
||
while(m_oLightReader.ReadNextSiblingNode(nDeath))
|
||
{
|
||
sName = m_oLightReader.GetName();
|
||
|
||
if(sName == L"body")
|
||
ReadStyle2();
|
||
else
|
||
{
|
||
// Стиль по ссылке
|
||
if(sName == L"link")
|
||
{
|
||
while(m_oLightReader.MoveToNextAttribute())
|
||
ReadStyleFromNetwork();
|
||
|
||
m_oLightReader.MoveToElement();
|
||
}
|
||
// тэг style содержит стили для styles.xml
|
||
else if(sName == L"style")
|
||
m_oCSSCalculator.AddStyles(m_oLightReader.GetText2());
|
||
else
|
||
ReadStyle();
|
||
}
|
||
}
|
||
}
|
||
|
||
void CHTMLReader::ReadStyle2()
|
||
{
|
||
const std::wstring wsName = m_oLightReader.GetName();
|
||
// Стиль по ссылке
|
||
if(wsName == L"link")
|
||
{
|
||
while(m_oLightReader.MoveToNextAttribute())
|
||
ReadStyleFromNetwork();
|
||
m_oLightReader.MoveToElement();
|
||
}
|
||
// тэг style содержит стили для styles.xml
|
||
else if(wsName == L"style")
|
||
m_oCSSCalculator.AddStyles(m_oLightReader.GetText2());
|
||
|
||
const int nDeath = m_oLightReader.GetDepth();
|
||
while(m_oLightReader.ReadNextSiblingNode(nDeath))
|
||
{
|
||
if(!m_oLightReader.IsEmptyNode())
|
||
ReadStyle2();
|
||
}
|
||
}
|
||
|
||
void CHTMLReader::ReadStyleFromNetwork()
|
||
{
|
||
if(m_oLightReader.GetName() != L"href")
|
||
return;
|
||
std::wstring sRef = m_oLightReader.GetText();
|
||
if(NSFile::GetFileExtention(sRef) != L"css")
|
||
return;
|
||
std::wstring sFName = NSFile::GetFileName(sRef);
|
||
// Стиль в сети
|
||
if(sRef.substr(0, 4) == L"http")
|
||
{
|
||
sFName = m_wsTempDirectory + L'/' + sFName;
|
||
NSNetwork::NSFileTransport::CFileDownloader oDownloadStyle(sRef, false);
|
||
oDownloadStyle.SetFilePath(sFName);
|
||
if(oDownloadStyle.DownloadSync())
|
||
{
|
||
m_oCSSCalculator.AddStylesFromFile(sFName);
|
||
NSFile::CFileBinary::Remove(sFName);
|
||
}
|
||
}
|
||
else
|
||
{
|
||
m_oCSSCalculator.AddStylesFromFile(m_wsSrcDirectory + L'/' + sFName);
|
||
m_oCSSCalculator.AddStylesFromFile(m_wsSrcDirectory + L'/' + sRef);
|
||
}
|
||
}
|
||
|
||
void CHTMLReader::ReadDocument()
|
||
{
|
||
m_oLightReader.ReadNextNode();
|
||
|
||
int nDeath = m_oLightReader.GetDepth();
|
||
while(m_oLightReader.ReadNextSiblingNode(nDeath))
|
||
{
|
||
const std::wstring wsName = m_oLightReader.GetName();
|
||
if(wsName == L"head")
|
||
ReadHead();
|
||
else if(wsName == L"body")
|
||
ReadBody();
|
||
}
|
||
}
|
||
|
||
void CHTMLReader::ReadHead()
|
||
{
|
||
if(m_oLightReader.IsEmptyNode())
|
||
return;
|
||
int nDeath = m_oLightReader.GetDepth();
|
||
while (m_oLightReader.ReadNextSiblingNode(nDeath))
|
||
{
|
||
const std::wstring wsName = m_oLightReader.GetName();
|
||
// Базовый адрес
|
||
if (L"base" == wsName)
|
||
m_wsBaseDirectory = GetArgumentValue(m_oLightReader, L"href");
|
||
}
|
||
|
||
m_oLightReader.MoveToElement();
|
||
}
|
||
|
||
void CHTMLReader::ReadBody()
|
||
{
|
||
std::vector<NSCSS::CNode> arSelectors;
|
||
|
||
arSelectors.push_back(NSCSS::CNode(L"html", L"", L""));
|
||
|
||
GetSubClass(arSelectors);
|
||
|
||
if (!arSelectors.back().m_mAttributes.empty())
|
||
{
|
||
std::map<std::wstring, std::wstring>::iterator itFound = arSelectors.back().m_mAttributes.find(L"bgcolor");
|
||
|
||
if (arSelectors.back().m_mAttributes.end() != itFound)
|
||
{
|
||
NSCSS::NSProperties::CColor oColor;
|
||
oColor.SetValue(itFound->second);
|
||
|
||
if (!oColor.Empty() && !oColor.None())
|
||
{
|
||
const std::wstring wsHEXColor{oColor.ToHEX()};
|
||
|
||
if (!wsHEXColor.empty())
|
||
m_pWriter->GetCurrentDocument()->WriteString(L"<w:background w:color=\"" + wsHEXColor + L"\"/>");
|
||
|
||
arSelectors.back().m_mAttributes.erase(itFound);
|
||
}
|
||
}
|
||
}
|
||
|
||
m_oLightReader.MoveToElement();
|
||
|
||
ReadStream(arSelectors);
|
||
}
|
||
|
||
// Reads every child of the current node through ReadInside().
// Returns true when at least one child reported success. When bInsertEmptyP is set, an empty
// paragraph is written if nothing reported success; for a node without children that case
// returns true, while for a node whose children all failed it returns false.
bool CHTMLReader::ReadStream(std::vector<NSCSS::CNode>& arSelectors, bool bInsertEmptyP)
{
	if (nullptr == m_pWriter)
		return false;

	const int nDepth = m_oLightReader.GetDepth();

	const bool bHasChildren = !m_oLightReader.IsEmptyNode() && m_oLightReader.ReadNextSiblingNode2(nDepth);

	if (!bHasChildren)
	{
		if (bInsertEmptyP)
			m_pWriter->WriteEmptyParagraph();

		return bInsertEmptyP;
	}

	bool bAnyWritten{false};

	do
	{
		// ReadInside() must run for every child, so it is evaluated before the accumulated flag
		bAnyWritten = ReadInside(arSelectors) || bAnyWritten;
	} while (m_oLightReader.ReadNextSiblingNode2(nDepth));

	if (bInsertEmptyP && !bAnyWritten)
		m_pWriter->WriteEmptyParagraph();

	return bAnyWritten;
}
|
||
|
||
// Processes the current node. Text nodes go to ReadText(); nodes rejected by UnreadableNode()
// or TagIsUnprocessed() are dropped. Every other node is appended to the selector chain and
// dispatched by tag: inline tags are read directly, a fixed set of tags is skipped, and all
// remaining tags are wrapped in the writer's BeginBlock()/EndBlock() pair.
// Returns the result of the handler that processed the node, false for dropped/skipped nodes.
bool CHTMLReader::ReadInside(std::vector<NSCSS::CNode>& arSelectors)
{
	const std::wstring wsNodeName{m_oLightReader.GetName()};

	if (L"#text" == wsNodeName)
		return ReadText(arSelectors);

	//TODO:: handle every possible return path
	if (UnreadableNode(wsNodeName) || TagIsUnprocessed(wsNodeName))
		return false;

	GetSubClass(arSelectors);

	const HtmlTag eTag{GetHtmlTag(wsNodeName)};
	bool bWritten = true;

	switch (eTag)
	{
		// Inline tags served by the handler registered under their own tag
		case HTML_TAG(ABBR):
		case HTML_TAG(CENTER):
		case HTML_TAG(KBD):
		case HTML_TAG(FONT):
		case HTML_TAG(MARK):
		case HTML_TAG(Q):
		case HTML_TAG(INPUT):
		case HTML_TAG(SPAN):
		case HTML_TAG(BASEFONT):
			bWritten = ReadDefaultTag(eTag, arSelectors);
			break;

		// Inline tags that reuse the handler of an equivalent tag
		case HTML_TAG(A):
		case HTML_TAG(AREA):
			bWritten = ReadDefaultTag(HTML_TAG(A), arSelectors);
			break;

		case HTML_TAG(B):
		case HTML_TAG(STRONG):
			bWritten = ReadDefaultTag(HTML_TAG(B), arSelectors);
			break;

		case HTML_TAG(BDO):
		case HTML_TAG(BDI):
			bWritten = ReadDefaultTag(HTML_TAG(BDO), arSelectors);
			break;

		case HTML_TAG(CITE):
		case HTML_TAG(DFN):
		case HTML_TAG(EM):
		case HTML_TAG(I):
		case HTML_TAG(VAR):
			bWritten = ReadDefaultTag(HTML_TAG(I), arSelectors);
			break;

		case HTML_TAG(CODE):
		case HTML_TAG(SAMP):
		case HTML_TAG(TT):
		case HTML_TAG(OUTPUT):
			bWritten = ReadDefaultTag(HTML_TAG(CODE), arSelectors);
			break;

		case HTML_TAG(DEL):
		case HTML_TAG(S):
		case HTML_TAG(STRIKE):
			bWritten = ReadDefaultTag(HTML_TAG(S), arSelectors);
			break;

		case HTML_TAG(INS):
		case HTML_TAG(U):
			bWritten = ReadDefaultTag(HTML_TAG(U), arSelectors);
			break;

		case HTML_TAG(SUP):
		case HTML_TAG(SUB):
			bWritten = ReadDefaultTag(HTML_TAG(SUP), arSelectors);
			break;

		case HTML_TAG(NOBR):
			bWritten = ReadDefaultTag(HTML_TAG(PRE), arSelectors);
			break;

		// Inline tags read through ReadEmptyTag()
		case HTML_TAG(BR):
		case HTML_TAG(IMG):
			bWritten = ReadEmptyTag(eTag, arSelectors);
			break;

		case HTML_TAG(SVG):
			bWritten = ReadSVG(arSelectors);
			break;

		// Unhandled tags are simply skipped
		case HTML_TAG(CANVAS):
		case HTML_TAG(VIDEO):
		case HTML_TAG(MATH):
		case HTML_TAG(IFRAME):
		case HTML_TAG(EMBED):
		case HTML_TAG(WBR):
		case HTML_TAG(AUDIO):
		case HTML_TAG(BGSOUND):
		case HTML_TAG(APPLET):
		case HTML_TAG(BLINK):
		case HTML_TAG(KEYGEN):
		case HTML_TAG(TITLE):
		case HTML_TAG(STYLE):
		case HTML_TAG(SCRIPT):
			arSelectors.pop_back();
			return false;

		// Inline containers: only their children are read
		case HTML_TAG(BUTTON):
		case HTML_TAG(LABEL):
		case HTML_TAG(DATA):
		case HTML_TAG(OBJECT):
		case HTML_TAG(NOSCRIPT):
		case HTML_TAG(TIME):
		case HTML_TAG(SMALL):
		case HTML_TAG(PROGRESS):
		case HTML_TAG(HGROUP):
		case HTML_TAG(METER):
		case HTML_TAG(ACRONYM):
		case HTML_TAG(BIG):
			bWritten = ReadStream(arSelectors);
			break;

		// Everything else is wrapped in a block
		default:
		{
			m_pWriter->BeginBlock();

			switch (eTag)
			{
				// Block tags served by the handler registered under their own tag
				case HTML_TAG(DD):
				case HTML_TAG(BLOCKQUOTE):
				case HTML_TAG(LI):
					bWritten = ReadDefaultTag(eTag, arSelectors);
					break;

				// Block tags that reuse the handler of an equivalent tag
				case HTML_TAG(ADDRESS):
					bWritten = ReadDefaultTag(HTML_TAG(I), arSelectors);
					break;

				case HTML_TAG(H1):
				case HTML_TAG(H2):
				case HTML_TAG(H3):
				case HTML_TAG(H4):
				case HTML_TAG(H5):
				case HTML_TAG(H6):
					bWritten = ReadDefaultTag(HTML_TAG(H1), arSelectors);
					break;

				case HTML_TAG(ASIDE):
				case HTML_TAG(DIV):
					bWritten = ReadDefaultTag(HTML_TAG(DIV), arSelectors);
					break;

				case HTML_TAG(OL):
				case HTML_TAG(UL):
					bWritten = ReadDefaultTag(HTML_TAG(OL), arSelectors);
					break;

				case HTML_TAG(PRE):
				case HTML_TAG(XMP):
					bWritten = ReadDefaultTag(HTML_TAG(PRE), arSelectors);
					break;

				case HTML_TAG(HR):
					bWritten = ReadEmptyTag(HTML_TAG(HR), arSelectors);
					break;

				case HTML_TAG(TABLE):
					bWritten = ReadTable(arSelectors);
					break;

				// Block containers: only their children are read.
				// TODO: MENU/SELECT/DATALIST/DIR (lists), RUBY, TEXTAREA/FIELDSET and DETAILS have no
				// dedicated handling yet and also end up in the generic branch below.
				case HTML_TAG(ARTICLE):
				case HTML_TAG(HEADER):
				case HTML_TAG(MAIN):
				case HTML_TAG(SUMMARY):
				case HTML_TAG(FOOTER):
				case HTML_TAG(NAV):
				case HTML_TAG(FIGCAPTION):
				case HTML_TAG(FORM):
				case HTML_TAG(OPTION):
				case HTML_TAG(DT):
				case HTML_TAG(P):
				case HTML_TAG(SECTION):
				case HTML_TAG(FIGURE):
				case HTML_TAG(DL):
				case HTML_TAG(LEGEND):
				case HTML_TAG(MAP):
				default:
					bWritten = ReadStream(arSelectors);
					break;
			}

			m_pWriter->EndBlock(bWritten);
		}
	}

	arSelectors.pop_back();
	return bWritten;
}
|
||
|
||
// Writes the text of the current node with the current selector chain.
// The node is appended to the chain for the duration of the call and removed afterwards.
// Returns false when there is no writer, otherwise the result of the writer's WriteText().
bool CHTMLReader::ReadText(std::vector<NSCSS::CNode>& arSelectors)
{
	if (nullptr == m_pWriter)
		return false;

	GetSubClass(arSelectors);

	const bool bTextWritten = m_pWriter->WriteText(m_oLightReader.GetText(), arSelectors);

	arSelectors.pop_back();

	return bTextWritten;
}
|
||
|
||
// Hands the outer XML of the current <svg> node to the handler registered for <img>.
// Returns false when that handler refuses to open, true once it has been opened and closed.
bool CHTMLReader::ReadSVG(const std::vector<NSCSS::CNode>& arSelectors)
{
	const bool bOpened = m_mTags[HTML_TAG(IMG)]->Open(arSelectors, m_oLightReader.GetOuterXml());

	if (bOpened)
		m_mTags[HTML_TAG(IMG)]->Close(arSelectors);

	return bOpened;
}
|
||
|
||
bool CHTMLReader::ReadTable(std::vector<NSCSS::CNode>& arSelectors)
|
||
{
|
||
if(m_oLightReader.IsEmptyNode())
|
||
return false;
|
||
|
||
CStorageTable oTable;
|
||
|
||
NSCSS::CCompiledStyle *pStyle = arSelectors.back().m_pCompiledStyle;
|
||
|
||
//Table styles
|
||
std::wstring wsFrame;
|
||
std::wstring wsValue;
|
||
|
||
if (arSelectors.back().GetAttributeValue(L"border", wsValue))
|
||
{
|
||
const int nWidth = NSStringFinder::ToInt(wsValue);
|
||
|
||
if (0 < nWidth)
|
||
{
|
||
oTable.SetRules(L"all");
|
||
|
||
if (pStyle->m_oBorder.Empty())
|
||
{
|
||
pStyle->m_oBorder.SetStyle(L"outset", 0, true);
|
||
pStyle->m_oBorder.SetWidth(nWidth, NSCSS::UnitMeasure::Point, 0, true);
|
||
pStyle->m_oBorder.SetColor(L"auto", 0, true);
|
||
}
|
||
}
|
||
else if (pStyle->m_oBorder.Empty())
|
||
{
|
||
pStyle->m_oBorder.SetNone(0, true);
|
||
oTable.SetRules(L"none");
|
||
}
|
||
}
|
||
|
||
if (arSelectors.back().GetAttributeValue(L"cellpadding", wsValue))
|
||
pStyle->m_oPadding.SetValues(wsValue + L"px", 0, true);
|
||
|
||
if (arSelectors.back().GetAttributeValue(L"rules", wsValue))
|
||
oTable.SetRules(wsValue);
|
||
|
||
arSelectors.back().GetAttributeValue(L"frame", wsFrame);
|
||
|
||
if (!wsFrame.empty() && pStyle->m_oBorder.Empty())
|
||
{
|
||
#define SetDefaultBorderSide(side) \
|
||
pStyle->m_oBorder.SetStyle##side(L"solid", 0, true); \
|
||
pStyle->m_oBorder.SetWidth##side(1, NSCSS::UnitMeasure::Point, 0, true); \
|
||
pStyle->m_oBorder.SetColor##side(L"black", 0, true)
|
||
|
||
if (NSStringFinder::Equals(L"border", wsFrame))
|
||
{
|
||
SetDefaultBorderSide();
|
||
}
|
||
else if (NSStringFinder::Equals(L"above", wsFrame))
|
||
{
|
||
SetDefaultBorderSide(TopSide);
|
||
}
|
||
else if (NSStringFinder::Equals(L"below", wsFrame))
|
||
{
|
||
SetDefaultBorderSide(BottomSide);
|
||
}
|
||
else if (NSStringFinder::Equals(L"hsides", wsFrame))
|
||
{
|
||
SetDefaultBorderSide(TopSide);
|
||
SetDefaultBorderSide(BottomSide);
|
||
}
|
||
else if (NSStringFinder::Equals(L"vsides", wsFrame))
|
||
{
|
||
SetDefaultBorderSide(LeftSide);
|
||
SetDefaultBorderSide(RightSide);
|
||
}
|
||
else if (NSStringFinder::Equals(L"rhs", wsFrame))
|
||
{
|
||
SetDefaultBorderSide(RightSide);
|
||
}
|
||
else if (NSStringFinder::Equals(L"lhs", wsFrame))
|
||
{
|
||
SetDefaultBorderSide(LeftSide);
|
||
}
|
||
}
|
||
|
||
if (pStyle->m_oBorder.GetCollapse() == NSCSS::NSProperties::BorderCollapse::Collapse)
|
||
oTable.SetCellSpacing(0);
|
||
else if (arSelectors.back().GetAttributeValue(L"cellspacing", wsValue))
|
||
oTable.SetCellSpacing(NSStringFinder::ToInt(wsValue));
|
||
else if (pStyle->m_oBorder.GetCollapse() == NSCSS::NSProperties::BorderCollapse::Separate)
|
||
oTable.SetCellSpacing(15);
|
||
|
||
oTable.SetWidth(pStyle->m_oDisplay.GetWidth());
|
||
oTable.SetBorder(pStyle->m_oBorder);
|
||
oTable.SetPadding(pStyle->m_oPadding);
|
||
oTable.SetMargin(pStyle->m_oMargin);
|
||
oTable.SetAlign(pStyle->m_oDisplay.GetHAlign().ToWString());
|
||
//------
|
||
|
||
int nDeath = m_oLightReader.GetDepth();
|
||
while(m_oLightReader.ReadNextSiblingNode(nDeath))
|
||
{
|
||
const std::wstring sName = m_oLightReader.GetName();
|
||
GetSubClass(arSelectors);
|
||
|
||
if(sName == L"caption")
|
||
ReadTableCaption(oTable, arSelectors);
|
||
if(sName == L"thead")
|
||
ReadTableRows(oTable, arSelectors, ERowParseMode::Header);
|
||
if(sName == L"tbody")
|
||
ReadTableRows(oTable, arSelectors, ERowParseMode::Body);
|
||
else if(sName == L"tfoot")
|
||
ReadTableRows(oTable, arSelectors, ERowParseMode::Foother);
|
||
else if (sName == L"colgroup")
|
||
ReadTableColspan(oTable);
|
||
|
||
arSelectors.pop_back();
|
||
}
|
||
|
||
oTable.Shorten();
|
||
oTable.CompleteTable();
|
||
|
||
if (!m_mTags[HTML_TAG(TABLE)]->Open(arSelectors, &oTable))
|
||
return false;
|
||
|
||
#define CONVERT_ROWS(rows, parse_mode)\
|
||
{\
|
||
const std::vector<CStorageTableRow*> arRows{rows};\
|
||
\
|
||
for (UINT unRow = 0; unRow < arRows.size(); ++unRow)\
|
||
{\
|
||
ERowPosition eRowPosition{ERowPosition::Middle};\
|
||
\
|
||
if (0 == unRow)\
|
||
eRowPosition = ERowPosition::First;\
|
||
else if (arRows.size() - 1 == unRow)\
|
||
eRowPosition = ERowPosition::Last;\
|
||
\
|
||
if (!m_mTags[HTML_TAG(TR)]->Open(arSelectors, boost::tuple<const TTableRowStyle*, const CStorageTable&, ERowParseMode, ERowPosition>(&arRows[unRow]->GetStyles(), oTable, parse_mode, eRowPosition)))\
|
||
continue;\
|
||
\
|
||
const std::vector<CStorageTableCell*>& arCells{arRows[unRow]->GetCells()};\
|
||
\
|
||
for (UINT unCol = 0; unCol < arCells.size(); ++unCol)\
|
||
{\
|
||
m_mTags[HTML_TAG(TD)]->Open(arSelectors, boost::tuple<const CStorageTableCell&, const CStorageTable&, UINT, ERowParseMode, ERowPosition>(*arCells[unCol], oTable, unCol, parse_mode, eRowPosition));\
|
||
\
|
||
if (0 != arCells[unCol]->GetData()->GetCurSize())\
|
||
WriteToStringBuilder(*(arCells[unCol]->GetData()), *(m_pWriter->GetCurrentDocument()));\
|
||
else\
|
||
m_pWriter->WriteEmptyParagraph();\
|
||
\
|
||
m_mTags[HTML_TAG(TD)]->Close(arSelectors);\
|
||
}\
|
||
\
|
||
m_mTags[HTML_TAG(TR)]->Close(arSelectors);\
|
||
}}
|
||
|
||
if (!oTable.HaveHeader())
|
||
{
|
||
if (m_mTags[HTML_TAG(TR)]->Open(arSelectors, boost::tuple<const TTableRowStyle*, const CStorageTable&, ERowParseMode, ERowPosition>(nullptr, oTable, ERowParseMode::Header, ERowPosition::First)))
|
||
m_mTags[HTML_TAG(TR)]->Close(arSelectors);
|
||
}
|
||
else
|
||
{
|
||
for (const std::vector<CStorageTableRow*>& arHeader : oTable.GetHeaders())
|
||
CONVERT_ROWS(arHeader, ERowParseMode::Header)
|
||
}
|
||
|
||
CONVERT_ROWS(oTable.GetRows(), ERowParseMode::Body)
|
||
CONVERT_ROWS(oTable.GetFoothers(), ERowParseMode::Foother)
|
||
|
||
m_mTags[HTML_TAG(TABLE)]->Close(arSelectors);
|
||
|
||
return true;
|
||
}
|
||
|
||
// Reads the <caption> element the reader is positioned on.
// While the caption is being read, writer output is redirected to oTable.GetCaptionData();
// the caption node is pushed onto arSelectors for the duration of the call and popped at the end.
// Does nothing when no writer is set.
void CHTMLReader::ReadTableCaption(CStorageTable& oTable, std::vector<NSCSS::CNode>& arSelectors)
{
	if (nullptr != m_pWriter)
	{
		GetSubClass(arSelectors);
		m_pWriter->SetDataOutput(oTable.GetCaptionData());

		// The caption's vertical alignment is set to "center" before its content is read
		NSCSS::CNode& oCaptionNode = arSelectors.back();
		oCaptionNode.m_pCompiledStyle->m_oDisplay.SetVAlign(L"center", arSelectors.size());

		ReadDefaultTag(HTML_TAG(CAPTION), arSelectors);

		m_pWriter->RevertDataOutput();
		arSelectors.pop_back();
	}
}
|
||
|
||
// Copies the cell-related properties (vertical/horizontal alignment, background colour,
// height, width, padding and border) from the compiled style of the LAST selector in
// arSelectors into pCellStyle. Only arSelectors.back() is read; earlier selectors are ignored.
//
// If the display-level horizontal alignment is empty, the text alignment is used instead.
//
// Does nothing when pCellStyle is null, when arSelectors is empty, or when the last
// selector has no compiled style (calling back() on an empty vector would be undefined behaviour).
void CalculateCellStyles(TTableCellStyle* pCellStyle, std::vector<NSCSS::CNode>& arSelectors)
{
	if (nullptr == pCellStyle || arSelectors.empty())
		return;

	NSCSS::CNode& oNode = arSelectors.back();

	if (!oNode.m_pCompiledStyle)
		return;

	// Resolve the compiled style once instead of re-walking back()->m_pCompiledStyle per field
	auto& oCompiledStyle = *oNode.m_pCompiledStyle;

	pCellStyle->m_wsVAlign    = oCompiledStyle.m_oDisplay.GetVAlign().ToWString();
	pCellStyle->m_wsHAlign    = oCompiledStyle.m_oDisplay.GetHAlign().ToWString();
	pCellStyle->m_oBackground = oCompiledStyle.m_oBackground.GetColor();
	pCellStyle->m_oHeight     = oCompiledStyle.m_oDisplay.GetHeight();
	pCellStyle->m_oWidth      = oCompiledStyle.m_oDisplay.GetWidth();
	pCellStyle->m_oPadding    = oCompiledStyle.m_oPadding;
	pCellStyle->m_oBorder     = oCompiledStyle.m_oBorder;

	// Fall back to the text alignment when no display-level horizontal alignment is set
	if (pCellStyle->m_wsHAlign.empty())
		pCellStyle->m_wsHAlign = oCompiledStyle.m_oText.GetAlign().ToWString();
}
|
||
|
||
struct TRowspanElement
|
||
{
|
||
UINT m_unRowSpan;
|
||
UINT m_unColumnIndex;
|
||
const CStorageTableCell* m_pCell;
|
||
|
||
TRowspanElement(UINT unRowSpan, UINT unColumnIndex, const CStorageTableCell* pCell)
|
||
: m_unRowSpan(unRowSpan), m_unColumnIndex(unColumnIndex), m_pCell(pCell)
|
||
{}
|
||
};
|
||
|
||
// Parses the <tr> children of the table section the reader is currently positioned on
// and hands the resulting rows to oTable.
//
// oTable      - table storage; receives all parsed rows through AddRows(arRows, eMode).
// arSelectors - selector stack; each <tr> and each cell node is pushed while it is being
//               parsed and popped afterwards.
// eMode       - forwarded unchanged to oTable.AddRows().
//
// Children of the section that are not <tr> are skipped. Every child of a <tr> produces a
// cell; only <th> and <td> children have their content read into the cell's data.
// Does nothing when no writer is set.
void CHTMLReader::ReadTableRows(CStorageTable& oTable, std::vector<NSCSS::CNode>& arSelectors, ERowParseMode eMode)
{
	if (nullptr == m_pWriter)
		return;

	// Cells from earlier rows whose rowspan still reaches into upcoming rows
	std::vector<TRowspanElement> arRowspanElements;
	std::vector<CStorageTableRow*> arRows;

	int nSectionDepth = m_oLightReader.GetDepth();
	while (m_oLightReader.ReadNextSiblingNode(nSectionDepth))
	{
		if (L"tr" != m_oLightReader.GetName())
			continue;

		GetSubClass(arSelectors);

		CStorageTableRow *pRow = new CStorageTableRow();

		// Insert an empty placeholder cell for every rowspan that started in a previous row
		for (std::vector<TRowspanElement>::iterator itElement = arRowspanElements.begin(); itElement != arRowspanElements.end();)
		{
			pRow->InsertCell(CStorageTableCell::CreateEmpty(itElement->m_pCell->GetColspan(), true, itElement->m_pCell->GetStyles()), itElement->m_unColumnIndex);

			itElement->m_unRowSpan--;

			// Once the remaining span drops to 1 the cell no longer reaches the next row
			if (1 == itElement->m_unRowSpan)
				itElement = arRowspanElements.erase(itElement);
			else
				++itElement;
		}

		UINT unColumnIndex = 0;
		int nRowDepth = m_oLightReader.GetDepth();
		while (m_oLightReader.ReadNextSiblingNode(nRowDepth))
		{
			// A plain new-expression throws on failure, so pCell is never null here
			CStorageTableCell *pCell = new CStorageTableCell();

			GetSubClass(arSelectors);

			// CalculateCellStyles only reads arSelectors.back(), so the selector stack is
			// passed as-is instead of copying the sub-chain starting at the "table" node
			// for every cell (that copy was empty, and back() undefined, if no "table"
			// node was present).
			CalculateCellStyles(pCell->GetStyles(), arSelectors);

			std::wstring wsValue;

			if (arSelectors.back().GetAttributeValue(L"colspan", wsValue))
				pCell->SetColspan(NSStringFinder::ToInt(wsValue, 1), pRow->GetIndex());

			if (arSelectors.back().GetAttributeValue(L"rowspan", wsValue))
			{
				pCell->SetRowspan(NSStringFinder::ToInt(wsValue, 1));

				// Remember the cell so that following rows get placeholder cells for it
				if (1 != pCell->GetRowspan())
					arRowspanElements.push_back({pCell->GetRowspan(), unColumnIndex, pCell});
			}

			const std::wstring wsCellName = m_oLightReader.GetName();
			const bool bHeaderCell = (wsCellName == L"th");

			// th: table header cell. Centered (unless an alignment is already set) and bold.
			if (bHeaderCell)
			{
				if (pCell->GetStyles()->m_wsHAlign.empty())
					arSelectors.back().m_pCompiledStyle->m_oText.SetAlign(L"center", arSelectors.size());

				arSelectors.back().m_pCompiledStyle->m_oFont.SetWeight(L"bold", arSelectors.size());
			}

			// th / td: read the cell content into the cell's own data buffer
			if (bHeaderCell || wsCellName == L"td")
			{
				m_pWriter->SetDataOutput(pCell->GetData());
				ReadStream(arSelectors, true);
				m_pWriter->RevertDataOutput();
			}

			// When pRow->GetIndex() has reached MAXCOLUMNSINTABLE - 1, the content of all
			// remaining td/th siblings of this row is appended to the current cell's data.
			if (pRow->GetIndex() == MAXCOLUMNSINTABLE - 1)
			{
				while (m_oLightReader.ReadNextSiblingNode(nRowDepth))
				{
					if (L"td" != m_oLightReader.GetName() && L"th" != m_oLightReader.GetName())
						continue;

					GetSubClass(arSelectors);
					m_pWriter->SetDataOutput(pCell->GetData());
					ReadStream(arSelectors);
					m_pWriter->RevertDataOutput();
					arSelectors.pop_back();
				}
			}

			pRow->AddCell(pCell);
			arSelectors.pop_back();

			++unColumnIndex;

			if (pRow->GetIndex() == MAXCOLUMNSINTABLE)
				break;
		}

		arSelectors.pop_back();
		arRows.push_back(pRow);
	}

	oTable.AddRows(arRows, eMode);
}
|
||
|
||
// Parses the <colgroup> element the reader is positioned on and registers it with oTable.
//
// Every <col> child becomes a CTableCol whose style is filled by CalculateCellStyles and is
// added to the group; other children are skipped. If the group is still empty afterwards
// (pColgroup->Empty()), a single column is created from the colgroup's own "span"
// attribute (1 when the attribute is missing) and styled from the colgroup node.
//
// Note: styles are calculated on a fresh selector stack that contains only the colgroup
// (and, temporarily, the current col) node.
void CHTMLReader::ReadTableColspan(CStorageTable& oTable)
{
	std::vector<NSCSS::CNode> arNodes;
	GetSubClass(arNodes);

	// Plain new-expressions throw on failure, so the allocations below never yield null
	CTableColgroup *pColgroup = new CTableColgroup(arNodes.back());
	oTable.AddColgroup(pColgroup);

	const int nDepth = m_oLightReader.GetDepth();
	if (!m_oLightReader.IsEmptyNode())
	{
		while (m_oLightReader.ReadNextSiblingNode2(nDepth))
		{
			if (L"col" != m_oLightReader.GetName())
				continue;

			GetSubClass(arNodes);

			CTableCol *pCol = new CTableCol(arNodes.back());
			CalculateCellStyles(pCol->GetStyle(), arNodes);

			// The col node is only needed while its style is being calculated
			arNodes.pop_back();

			pColgroup->AddCol(pCol);
		}
	}

	if (pColgroup->Empty())
	{
		// arNodes holds only the colgroup node at this point
		const auto& mAttributes = arNodes.front().m_mAttributes;
		const auto itFound = mAttributes.find(L"span");

		CTableCol *pCol = new CTableCol((mAttributes.cend() != itFound) ? NSStringFinder::ToInt(itFound->second, 1) : 1);

		CalculateCellStyles(pCol->GetStyle(), arNodes);

		pColgroup->AddCol(pCol);
	}
}
|
||
|
||
// Opens the handler registered for unTag and closes it right away, without reading content.
// Returns false (and skips Close) when Open() fails, true otherwise.
bool CHTMLReader::ReadEmptyTag(UINT unTag, const std::vector<NSCSS::CNode>& arSelectors)
{
	const bool bOpened = m_mTags[unTag]->Open(arSelectors);

	if (bOpened)
		m_mTags[unTag]->Close(arSelectors);

	return bOpened;
}
|
||
|
||
// Opens the handler registered for unTag, reads the node content with ReadStream and
// closes the handler again.
// Returns false when Open() fails (nothing is read or closed); otherwise returns the
// result of ReadStream.
bool CHTMLReader::ReadDefaultTag(UINT unTag, std::vector<NSCSS::CNode>& arSelectors)
{
	bool bResult = false;

	if (m_mTags[unTag]->Open(arSelectors))
	{
		bResult = ReadStream(arSelectors);
		m_mTags[unTag]->Close(arSelectors);
	}

	return bResult;
}
|
||
|
||
// Builds a selector node for the element the reader is positioned on, appends it to
// arSelectors and asks the CSS calculator to compute the compiled style for the stack.
//
// The node receives the element name plus its attributes:
//   class / id - stored XML-encoded in m_wsClass / m_wsId
//   style      - appended to m_wsStyle
//   any other  - stored in m_mAttributes when CheckArgumentMath accepts it for this element
// The reader is moved back to the element before the node is pushed.
void CHTMLReader::GetSubClass(std::vector<NSCSS::CNode>& arSelectors)
{
	NSCSS::CNode oNode;
	oNode.m_wsName = m_oLightReader.GetName();

	// Style given through attributes
	for (bool bHasAttribute = m_oLightReader.MoveToFirstAttribute(); bHasAttribute; bHasAttribute = m_oLightReader.MoveToNextAttribute())
	{
		const std::wstring wsAttribute = m_oLightReader.GetName();

		if (L"class" == wsAttribute)
		{
			oNode.m_wsClass = EncodeXmlString(m_oLightReader.GetText());
			continue;
		}

		if (L"id" == wsAttribute)
		{
			oNode.m_wsId = EncodeXmlString(m_oLightReader.GetText());
			// WriteEmptyBookmark(oXml, oNode.m_wsId);

			// if (!m_oStylesCalculator.HaveStylesById(oNode.m_wsId))
			// 	oNode.m_wsId.clear();
			continue;
		}

		if (L"style" == wsAttribute)
		{
			oNode.m_wsStyle += m_oLightReader.GetText();
			continue;
		}

		if (CheckArgumentMath(oNode.m_wsName, wsAttribute))
			oNode.m_mAttributes[wsAttribute] = m_oLightReader.GetText();
	}

	m_oLightReader.MoveToElement();
	arSelectors.push_back(oNode);

	m_oCSSCalculator.CalculateCompiledStyle(arSelectors);
}
|
||
|
||
inline std::wstring GetArgumentValue(XmlUtils::CXmlLiteReader& oLiteReader, const std::wstring& wsArgumentName, const std::wstring& wsDefaultValue)
|
||
{
|
||
if (!oLiteReader.MoveToFirstAttribute())
|
||
return wsDefaultValue;
|
||
|
||
std::wstring wsValue{wsDefaultValue};
|
||
|
||
do
|
||
{
|
||
if (wsArgumentName == oLiteReader.GetName())
|
||
{
|
||
wsValue = oLiteReader.GetText();
|
||
break;
|
||
}
|
||
} while (oLiteReader.MoveToNextAttribute());
|
||
|
||
oLiteReader.MoveToElement();
|
||
return wsValue;
|
||
}
|
||
|
||
// The CSS calculator does not know which node a style is being calculated for, nor that
// some styles are meant for one particular node only, so for now it is simpler to filter
// such cases here in advance.
// ! Used for styles that are specified through attributes !
//
// Returns false only for the "border" attribute on any node other than "table";
// every other node/attribute combination is accepted.
inline bool CheckArgumentMath(const std::wstring& wsNodeName, const std::wstring& wsStyleName)
{
	const bool bIsBorder = (L"border" == wsStyleName);
	const bool bIsTable  = (L"table" == wsNodeName);

	return !bIsBorder || bIsTable;
}
|
||
|
||
// Maps a tag name to its HtmlTag value using m_HTML_TAGS.
// Names missing from the table that are longer than three characters and end in "svg"
// are reported as the SVG tag; everything else unknown yields UNKNOWN_TAG.
inline HtmlTag GetHtmlTag(const std::wstring& wsStrTag)
{
	const std::map<std::wstring, HtmlTag>::const_iterator itTag = m_HTML_TAGS.find(wsStrTag);

	if (m_HTML_TAGS.cend() != itTag)
		return itTag->second;

	const std::wstring::size_type unLength = wsStrTag.length();
	const bool bSvgSuffix = unLength > 3 && 0 == wsStrTag.compare(unLength - 3, 3, L"svg");

	return bSvgSuffix ? HTML_TAG(SVG) : UNKNOWN_TAG;
}
|
||
|
||
// Returns true when wsNodeName is exactly "head", "meta" or "style" (case-sensitive).
inline bool UnreadableNode(const std::wstring& wsNodeName)
{
	const wchar_t* const arUnreadable[] = {L"head", L"meta", L"style"};

	for (const wchar_t* wsCandidate : arUnreadable)
	{
		if (wsCandidate == wsNodeName)
			return true;
	}

	return false;
}
|
||
|
||
// Returns true when wsTagName is exactly "xml" (case-sensitive).
inline bool TagIsUnprocessed(const std::wstring& wsTagName)
{
	return 0 == wsTagName.compare(L"xml");
}
|
||
}
|