Modification of the html converter

This commit is contained in:
Kirill Polyakov
2026-02-10 02:18:38 +03:00
parent 56b8e41875
commit 098d40d44f
15 changed files with 445 additions and 245 deletions

View File

@ -511,7 +511,6 @@ namespace NSCSS
m_mDefaultStyleData[L"i"] = new CElement(L"i", {{L"font-style", L"italic"}}); m_mDefaultStyleData[L"i"] = new CElement(L"i", {{L"font-style", L"italic"}});
m_mDefaultStyleData[L"code"] = new CElement(L"code", {{L"font-family", L"Courier New"}}); m_mDefaultStyleData[L"code"] = new CElement(L"code", {{L"font-family", L"Courier New"}});
m_mDefaultStyleData[L"kbd"] = new CElement(L"kbd", {{L"font-family", L"Courier New"}, m_mDefaultStyleData[L"kbd"] = new CElement(L"kbd", {{L"font-family", L"Courier New"},
{L"font-size", L"20pt"},
{L"font_weight", L"bold"}}); {L"font_weight", L"bold"}});
m_mDefaultStyleData[L"s"] = new CElement(L"s", {{L"text-decoration", L"line-through"}}); m_mDefaultStyleData[L"s"] = new CElement(L"s", {{L"text-decoration", L"line-through"}});
m_mDefaultStyleData[L"u"] = new CElement(L"u", {{L"text-decoration", L"underline"}}); m_mDefaultStyleData[L"u"] = new CElement(L"u", {{L"text-decoration", L"underline"}});
@ -520,7 +519,6 @@ namespace NSCSS
m_mDefaultStyleData[L"sub"] = new CElement(L"sub", {{L"vertical-align", L"bottom"}}); m_mDefaultStyleData[L"sub"] = new CElement(L"sub", {{L"vertical-align", L"bottom"}});
m_mDefaultStyleData[L"dd"] = new CElement(L"dd", {{L"margin-left", L"720tw"}}); m_mDefaultStyleData[L"dd"] = new CElement(L"dd", {{L"margin-left", L"720tw"}});
m_mDefaultStyleData[L"pre"] = new CElement(L"pre", {{L"font-family", L"Courier New"}, m_mDefaultStyleData[L"pre"] = new CElement(L"pre", {{L"font-family", L"Courier New"},
{L"font-size", L"20pt"},
{L"margin-top", L"0"}, {L"margin-top", L"0"},
{L"margin-bottom", L"0"}}); {L"margin-bottom", L"0"}});
m_mDefaultStyleData[L"blockquote"] = new CElement(L"blockquote", {{L"margin", L"0px"}}); m_mDefaultStyleData[L"blockquote"] = new CElement(L"blockquote", {{L"margin", L"0px"}});

View File

@ -142,7 +142,7 @@ HRESULT CEpubFile::Convert(const std::wstring& sInputFile, const std::wstring& s
*/ */
CHtmlFile2 oFile; CHtmlFile2 oFile;
CHtmlParams oFileParams; HTML::THTMLParameters oFileParams;
oFileParams.SetAuthors (m_oBookInfo.GetCreators()); oFileParams.SetAuthors (m_oBookInfo.GetCreators());
oFileParams.SetGenres (m_oBookInfo.GetSubjects()); oFileParams.SetGenres (m_oBookInfo.GetSubjects());
@ -155,7 +155,7 @@ HRESULT CEpubFile::Convert(const std::wstring& sInputFile, const std::wstring& s
std::wstring sDocxFileTempDir = m_sTempDir + L"/tmp"; std::wstring sDocxFileTempDir = m_sTempDir + L"/tmp";
NSDirectory::CreateDirectory(sDocxFileTempDir); NSDirectory::CreateDirectory(sDocxFileTempDir);
oFile.SetTmpDirectory(sDocxFileTempDir); oFile.SetTempDirectory(sDocxFileTempDir);
oFile.SetCoreDirectory(NSFile::GetDirectoryName(sContent)); oFile.SetCoreDirectory(NSFile::GetDirectoryName(sContent));
std::vector<std::wstring> arFiles; std::vector<std::wstring> arFiles;
@ -182,7 +182,7 @@ HRESULT CEpubFile::Convert(const std::wstring& sInputFile, const std::wstring& s
sOutputDir = sOutputFile; sOutputDir = sOutputFile;
NSDirectory::CreateDirectory(sOutputDir); NSDirectory::CreateDirectory(sOutputDir);
HRESULT hRes = oFile.OpenBatchHtml(arFiles, sOutputDir, &oFileParams); HRESULT hRes = oFile.ConvertHTML2OOXML(arFiles, sOutputDir, &oFileParams);
if (bIsOutCompress && S_OK == hRes) if (bIsOutCompress && S_OK == hRes)
hRes = oOfficeUtils.CompressFileOrDirectory(sOutputDir, sOutputFile); hRes = oOfficeUtils.CompressFileOrDirectory(sOutputDir, sOutputFile);

View File

@ -2170,7 +2170,7 @@ HRESULT CFb2File::FromHtml(const std::wstring& sHtmlFile, const std::wstring& sD
RELEASEARRAYOBJECTS(pData); RELEASEARRAYOBJECTS(pData);
//XmlUtils::CXmlLiteReader oIndexHtml; //XmlUtils::CXmlLiteReader oIndexHtml;
std::wstring xhtml = htmlToXhtml(sContent, bNeedConvert); std::wstring xhtml = HTML::htmlToXhtml(sContent, bNeedConvert);
if (!m_internal->m_oLightReader.FromString(xhtml)) if (!m_internal->m_oLightReader.FromString(xhtml))
return S_FALSE; return S_FALSE;

View File

@ -162,6 +162,96 @@ const static std::map<std::wstring, HtmlTag> m_HTML_TAGS
ADD_TAG(L"svg", SVG) ADD_TAG(L"svg", SVG)
}; };
bool HTML2XHTML(const std::wstring& wsFileName, XmlUtils::CXmlLiteReader& oLiteReader)
{
BYTE* pData;
DWORD nLength;
if (!NSFile::CFileBinary::ReadAllBytes(wsFileName, &pData, nLength))
return false;
std::string sFileContent = XmlUtils::GetUtf8FromFileContent(pData, nLength);
bool bNeedConvert = true;
if (nLength > 4)
{
if (pData[0] == 0xFF && pData[1] == 0xFE && !(pData[2] == 0x00 && pData[3] == 0x00))
bNeedConvert = false;
if (pData[0] == 0xFE && pData[1] == 0xFF)
bNeedConvert = false;
if (pData[0] == 0xFF && pData[1] == 0xFE && pData[2] == 0x00 && pData[3] == 0x00)
bNeedConvert = false;
if (pData[0] == 0 && pData[1] == 0 && pData[2] == 0xFE && pData[3] == 0xFF)
bNeedConvert = false;
}
RELEASEARRAYOBJECTS(pData);
size_t nFind = sFileContent.find("version=\"");
if(nFind != std::string::npos)
{
nFind += 9;
size_t nFindEnd = sFileContent.find("\"", nFind);
if(nFindEnd != std::string::npos)
sFileContent.replace(nFind, nFindEnd - nFind, "1.0");
}
const std::wstring sRes{htmlToXhtml(sFileContent, bNeedConvert)};
#ifdef SAVE_NORMALIZED_HTML
#if 1 == SAVE_NORMALIZED_HTML
NSFile::CFileBinary oWriter;
if (oWriter.CreateFileW(L"res.html"))
{
oWriter.WriteStringUTF8(sRes);
oWriter.CloseFile();
}
#endif
#endif
return oLiteReader.FromString(sRes);
}
bool MHT2XHTML(const std::wstring& wsFileName, XmlUtils::CXmlLiteReader& oLiteReader)
{
NSFile::CFileBinary file;
if (!file.OpenFile(wsFileName))
return false;
unsigned char* buffer = new unsigned char[4096];
if (!buffer)
{
file.CloseFile();
return false;
}
DWORD dwReadBytes = 0;
file.ReadFile(buffer, 4096, dwReadBytes);
file.CloseFile();
std::string xml_string = XmlUtils::GetUtf8FromFileContent(buffer, dwReadBytes);
const std::string sContentType = NSStringFinder::FindProperty(xml_string, "content-type", ":", ";");
bool bRes = false;
if(NSStringFinder::Equals(sContentType, "multipart/related"))
{
BYTE* pData;
DWORD nLength;
if (!NSFile::CFileBinary::ReadAllBytes(wsFileName, &pData, nLength))
return false;
std::string sFileContent = XmlUtils::GetUtf8FromFileContent(pData, nLength);
RELEASEARRAYOBJECTS(pData);
const std::wstring sRes = mhtToXhtml(sFileContent);
bRes = oLiteReader.FromString(sRes);
}
else
bRes = HTML2XHTML(wsFileName, oLiteReader);
RELEASEARRAYOBJECTS(buffer);
return bRes;
}
inline std::wstring GetArgumentValue(XmlUtils::CXmlLiteReader& oLiteReader, const std::wstring& wsArgumentName, const std::wstring& wsDefaultValue = L""); inline std::wstring GetArgumentValue(XmlUtils::CXmlLiteReader& oLiteReader, const std::wstring& wsArgumentName, const std::wstring& wsDefaultValue = L"");
inline bool CheckArgumentMath(const std::wstring& wsNodeName, const std::wstring& wsStyleName); inline bool CheckArgumentMath(const std::wstring& wsNodeName, const std::wstring& wsStyleName);
inline HtmlTag GetHtmlTag(const std::wstring& wsStrTag); inline HtmlTag GetHtmlTag(const std::wstring& wsStrTag);
@ -190,17 +280,42 @@ void CHTMLReader::SetCoreDirectory(const std::wstring& wsPath)
HRESULT CHTMLReader::ConvertHTML2OOXML(const std::wstring& wsPath, const std::wstring& wsDirectory, THTMLParameters* pParameters) HRESULT CHTMLReader::ConvertHTML2OOXML(const std::wstring& wsPath, const std::wstring& wsDirectory, THTMLParameters* pParameters)
{ {
InitOOXMLTags(pParameters); return InitAndConvert2OOXML({wsPath}, wsDirectory, HTML2XHTML, pParameters);
m_wsDstDirectory = wsDirectory;
return ConvertHTML(wsPath, wsDirectory);
} }
HRESULT CHTMLReader::ConvertHTML2Markdown(const std::wstring& wsPath, const std::wstring& wsFinalFile, TMarkdownParameters* pParameters) HRESULT CHTMLReader::ConvertHTML2Markdown(const std::wstring& wsPath, const std::wstring& wsFinalFile, TMarkdownParameters* pParameters)
{ {
InitMDTags(); return InitAndConvert2Markdown({wsPath}, wsFinalFile, HTML2XHTML, pParameters);
return ConvertHTML(wsPath, wsFinalFile); }
/**
 * Converts a batch of HTML files into OOXML.
 *
 * @param arPaths     input HTML files, processed in order.
 * @param wsDirectory destination directory passed on to InitAndConvert2OOXML().
 * @param pParameters optional conversion parameters; forwarded unchanged.
 * @return the result of InitAndConvert2OOXML().
 */
HRESULT CHTMLReader::ConvertHTML2OOXML(const std::vector<std::wstring>& arPaths, const std::wstring& wsDirectory, THTMLParameters* pParameters)
{
	// Every file is normalized with the plain-HTML loader.
	const Convert_Func fnToXhtml{HTML2XHTML};

	return InitAndConvert2OOXML(arPaths, wsDirectory, fnToXhtml, pParameters);
}
/**
 * Converts a batch of HTML files into Markdown.
 *
 * @param arPaths     input HTML files, processed in order.
 * @param wsFinalFile output file passed on to InitAndConvert2Markdown().
 * @param pParameters optional Markdown parameters; forwarded unchanged.
 * @return the result of InitAndConvert2Markdown().
 */
HRESULT CHTMLReader::ConvertHTML2Markdown(const std::vector<std::wstring>& arPaths, const std::wstring& wsFinalFile, TMarkdownParameters* pParameters)
{
	// Every file is normalized with the plain-HTML loader.
	const Convert_Func fnToXhtml{HTML2XHTML};

	return InitAndConvert2Markdown(arPaths, wsFinalFile, fnToXhtml, pParameters);
}
/**
 * Converts a single MHT file into OOXML.
 *
 * @param wsPath      input file.
 * @param wsDirectory destination directory passed on to InitAndConvert2OOXML().
 * @param pParameters optional conversion parameters; forwarded unchanged.
 * @return the result of InitAndConvert2OOXML().
 */
HRESULT CHTMLReader::ConvertMHT2OOXML(const std::wstring& wsPath, const std::wstring& wsDirectory, THTMLParameters* pParameters)
{
	// The file is loaded through the MHT-aware loader.
	const Convert_Func fnToXhtml{MHT2XHTML};

	return InitAndConvert2OOXML({wsPath}, wsDirectory, fnToXhtml, pParameters);
}
/**
 * Converts a single MHT file into Markdown.
 *
 * @param wsPath      input file.
 * @param wsFinalFile output file passed on to InitAndConvert2Markdown().
 * @param pParameters optional Markdown parameters; forwarded unchanged.
 * @return the result of InitAndConvert2Markdown().
 */
HRESULT CHTMLReader::ConvertMHT2Markdown(const std::wstring& wsPath, const std::wstring& wsFinalFile, TMarkdownParameters* pParameters)
{
	// The file is loaded through the MHT-aware loader.
	const Convert_Func fnToXhtml{MHT2XHTML};

	return InitAndConvert2Markdown({wsPath}, wsFinalFile, fnToXhtml, pParameters);
}
/**
 * Converts a batch of MHT files into OOXML.
 *
 * @param arPaths     input files, processed in order.
 * @param wsDirectory destination directory passed on to InitAndConvert2OOXML().
 * @param pParameters optional conversion parameters; forwarded unchanged.
 * @return the result of InitAndConvert2OOXML().
 */
HRESULT CHTMLReader::ConvertMHT2OOXML(const std::vector<std::wstring>& arPaths, const std::wstring& wsDirectory, THTMLParameters* pParameters)
{
	// Every file is loaded through the MHT-aware loader.
	const Convert_Func fnToXhtml{MHT2XHTML};

	return InitAndConvert2OOXML(arPaths, wsDirectory, fnToXhtml, pParameters);
}
HRESULT CHTMLReader::ConvertMHT2Markdown(const std::vector<std::wstring>& arPaths, const std::wstring& wsFinalFile, TMarkdownParameters* pParameters)
{
return InitAndConvert2Markdown(arPaths, wsFinalFile, MHT2XHTML, pParameters);
} }
void CHTMLReader::Clear() void CHTMLReader::Clear()
@ -221,14 +336,11 @@ void CHTMLReader::InitOOXMLTags(THTMLParameters* pParametrs)
{ {
Clear(); Clear();
COOXMLWriter *pWriter = new COOXMLWriter(); COOXMLWriter *pWriter = new COOXMLWriter(pParametrs, &m_oCSSCalculator);
if (nullptr == pWriter) if (nullptr == pWriter)
return; return;
pWriter->SetCSSCalculator(&m_oCSSCalculator);
pWriter->SetHTMLParameters(pParametrs);
pWriter->SetSrcDirectory (m_wsSrcDirectory); pWriter->SetSrcDirectory (m_wsSrcDirectory);
pWriter->SetDstDirectory (m_wsDstDirectory); pWriter->SetDstDirectory (m_wsDstDirectory);
pWriter->SetTempDirectory(m_wsTempDirectory); pWriter->SetTempDirectory(m_wsTempDirectory);
@ -270,12 +382,12 @@ void CHTMLReader::InitOOXMLTags(THTMLParameters* pParametrs)
m_mTags[HTML_TAG(BDO)] = oIgnoredTag; m_mTags[HTML_TAG(BDO)] = oIgnoredTag;
m_mTags[HTML_TAG(SPAN)] = oIgnoredTag; m_mTags[HTML_TAG(SPAN)] = oIgnoredTag;
m_mTags[HTML_TAG(H1)] = oIgnoredTag; m_mTags[HTML_TAG(H1)] = oIgnoredTag;
m_mTags[HTML_TAG(CODE)] = oIgnoredTag; m_mTags[HTML_TAG(CODE)] = oIgnoredTag;
} }
void CHTMLReader::InitMDTags() void CHTMLReader::InitMDTags(TMarkdownParameters* pParametrs)
{ {
CMDWriter *pWriter = new CMDWriter({}); CMDWriter *pWriter = new CMDWriter((nullptr != pParametrs) ? *pParametrs : TMarkdownParameters{});
if (nullptr == pWriter) if (nullptr == pWriter)
return; return;
@ -326,65 +438,56 @@ bool CHTMLReader::IsHTML()
return ((m_oLightReader.MoveToStart() && m_oLightReader.ReadNextNode()) ? m_oLightReader.GetName() == L"html" : false); return ((m_oLightReader.MoveToStart() && m_oLightReader.ReadNextNode()) ? m_oLightReader.GetName() == L"html" : false);
} }
bool CHTMLReader::HTML2XHTML(const std::wstring& wsFileName) HRESULT CHTMLReader::InitAndConvert2OOXML(const std::vector<std::wstring>& arPaths, const std::wstring& wsDirectory, Convert_Func Convertation, THTMLParameters* pParameters)
{ {
BYTE* pData; InitOOXMLTags(pParameters);
DWORD nLength; m_wsDstDirectory = wsDirectory;
if (!NSFile::CFileBinary::ReadAllBytes(wsFileName, &pData, nLength))
return false;
std::string sFileContent = XmlUtils::GetUtf8FromFileContent(pData, nLength); HRESULT lResult{S_FALSE};
bool bNeedConvert = true;
if (nLength > 4)
{
if (pData[0] == 0xFF && pData[1] == 0xFE && !(pData[2] == 0x00 && pData[3] == 0x00))
bNeedConvert = false;
if (pData[0] == 0xFE && pData[1] == 0xFF)
bNeedConvert = false;
if (pData[0] == 0xFF && pData[1] == 0xFE && pData[2] == 0x00 && pData[3] == 0x00)
bNeedConvert = false;
if (pData[0] == 0 && pData[1] == 0 && pData[2] == 0xFE && pData[3] == 0xFF)
bNeedConvert = false;
}
RELEASEARRAYOBJECTS(pData);
size_t nFind = sFileContent.find("version=\"");
if(nFind != std::string::npos)
{
nFind += 9;
size_t nFindEnd = sFileContent.find("\"", nFind);
if(nFindEnd != std::string::npos)
sFileContent.replace(nFind, nFindEnd - nFind, "1.0");
}
const std::wstring sRes{htmlToXhtml(sFileContent, bNeedConvert)};
#ifdef SAVE_NORMALIZED_HTML
#if 1 == SAVE_NORMALIZED_HTML
NSFile::CFileBinary oWriter;
if (oWriter.CreateFileW(m_sTmp + L"/res.html"))
{
oWriter.WriteStringUTF8(sRes);
oWriter.CloseFile();
}
#endif
#endif
return m_oLightReader.FromString(sRes);
}
HRESULT CHTMLReader::ConvertHTML(const std::wstring& wsPath, const std::wstring& wsDirectory)
{
if (nullptr == m_pWriter || !HTML2XHTML(wsPath) || !m_oLightReader.IsValid() || !IsHTML())
return S_FALSE;
m_pWriter->Begin(wsDirectory); m_pWriter->Begin(wsDirectory);
for (const std::wstring& wsPath : arPaths)
{
if (Convert(wsPath, Convertation))
{
lResult = S_OK;
if (nullptr != pParameters && pParameters->m_bNeedPageBreakBefore)
m_pWriter->PageBreak();
}
}
m_pWriter->End(wsDirectory);
return lResult;
}
/**
 * Sets up the Markdown tag handlers and converts every input file into one Markdown output.
 *
 * The writer is started once, each path is run through Convert() with the supplied
 * loader, and the writer is finished with wsFinalFile.
 *
 * @param arPaths      input files, processed in order.
 * @param wsFinalFile  value passed to the writer's End().
 * @param Convertation loader that turns a file into XHTML inside the XML reader.
 * @param pParameters  optional Markdown parameters, forwarded to InitMDTags().
 * @return S_OK when at least one file was converted; S_FALSE otherwise, including the
 *         case where no writer is available after initialization.
 */
HRESULT CHTMLReader::InitAndConvert2Markdown(const std::vector<std::wstring>& arPaths, const std::wstring& wsFinalFile, Convert_Func Convertation, TMarkdownParameters* pParameters)
{
	InitMDTags(pParameters);

	// Convert() treats a missing writer as a failure; apply the same check here
	// instead of dereferencing a null pointer in Begin()/End().
	if (nullptr == m_pWriter)
		return S_FALSE;

	HRESULT lResult{S_FALSE};

	m_pWriter->Begin(L"");

	for (const std::wstring& wsPath : arPaths)
	{
		// One successful file is enough for the batch to report success.
		if (Convert(wsPath, Convertation))
			lResult = S_OK;
	}

	m_pWriter->End(wsFinalFile);

	return lResult;
}
bool CHTMLReader::Convert(const std::wstring& wsPath, Convert_Func Convertation)
{
if (nullptr == m_pWriter || !Convertation(wsPath, m_oLightReader) || !m_oLightReader.IsValid() || !IsHTML())
return false;
m_wsSrcDirectory = NSSystemPath::GetDirectoryName(wsPath); m_wsSrcDirectory = NSSystemPath::GetDirectoryName(wsPath);
// m_sDst = sDst;
m_oLightReader.MoveToStart(); m_oLightReader.MoveToStart();
m_oLightReader.ReadNextNode(); m_oLightReader.ReadNextNode();
@ -394,13 +497,9 @@ HRESULT CHTMLReader::ConvertHTML(const std::wstring& wsPath, const std::wstring&
if(!m_oLightReader.MoveToStart()) if(!m_oLightReader.MoveToStart())
return S_FALSE; return S_FALSE;
// if(oParams && oParams->m_bNeedPageBreakBefore)
// m_internal->PageBreakBefore();
ReadDocument(); ReadDocument();
m_pWriter->End(wsDirectory); return true;
return S_OK;
} }
void CHTMLReader::ReadStyle() void CHTMLReader::ReadStyle()
@ -438,16 +537,16 @@ void CHTMLReader::ReadStyle()
void CHTMLReader::ReadStyle2() void CHTMLReader::ReadStyle2()
{ {
std::wstring sName = m_oLightReader.GetName(); const std::wstring wsName = m_oLightReader.GetName();
// Стиль по ссылке // Стиль по ссылке
if(sName == L"link") if(wsName == L"link")
{ {
while(m_oLightReader.MoveToNextAttribute()) while(m_oLightReader.MoveToNextAttribute())
ReadStyleFromNetwork(); ReadStyleFromNetwork();
m_oLightReader.MoveToElement(); m_oLightReader.MoveToElement();
} }
// тэг style содержит стили для styles.xml // тэг style содержит стили для styles.xml
else if(sName == L"style") else if(wsName == L"style")
m_oCSSCalculator.AddStyles(m_oLightReader.GetText2()); m_oCSSCalculator.AddStyles(m_oLightReader.GetText2());
const int nDeath = m_oLightReader.GetDepth(); const int nDeath = m_oLightReader.GetDepth();
@ -492,10 +591,10 @@ void CHTMLReader::ReadDocument()
int nDeath = m_oLightReader.GetDepth(); int nDeath = m_oLightReader.GetDepth();
while(m_oLightReader.ReadNextSiblingNode(nDeath)) while(m_oLightReader.ReadNextSiblingNode(nDeath))
{ {
std::wstring sName = m_oLightReader.GetName(); const std::wstring wsName = m_oLightReader.GetName();
if(sName == L"head") if(wsName == L"head")
ReadHead(); ReadHead();
else if(sName == L"body") else if(wsName == L"body")
ReadBody(); ReadBody();
} }
} }
@ -524,12 +623,11 @@ void CHTMLReader::ReadBody()
GetSubClass(arSelectors); GetSubClass(arSelectors);
/* if (!arSelectors.back().m_mAttributes.empty())
if (!sSelectors.back().m_mAttributes.empty())
{ {
std::map<std::wstring, std::wstring>::iterator itFound = sSelectors.back().m_mAttributes.find(L"bgcolor"); std::map<std::wstring, std::wstring>::iterator itFound = arSelectors.back().m_mAttributes.find(L"bgcolor");
if (sSelectors.back().m_mAttributes.end() != itFound) if (arSelectors.back().m_mAttributes.end() != itFound)
{ {
NSCSS::NSProperties::CColor oColor; NSCSS::NSProperties::CColor oColor;
oColor.SetValue(itFound->second); oColor.SetValue(itFound->second);
@ -539,15 +637,14 @@ void CHTMLReader::ReadBody()
const std::wstring wsHEXColor{oColor.ToHEX()}; const std::wstring wsHEXColor{oColor.ToHEX()};
if (!wsHEXColor.empty()) if (!wsHEXColor.empty())
m_oDocXml.WriteString(L"<w:background w:color=\"" + wsHEXColor + L"\"/>"); m_pWriter->GetCurrentDocument()->WriteString(L"<w:background w:color=\"" + wsHEXColor + L"\"/>");
sSelectors.back().m_mAttributes.erase(itFound); arSelectors.back().m_mAttributes.erase(itFound);
} }
} }
} }
m_oLightReader.MoveToElement(); m_oLightReader.MoveToElement();
*/
ReadStream(arSelectors); ReadStream(arSelectors);
} }
@ -557,42 +654,24 @@ bool CHTMLReader::ReadStream(std::vector<NSCSS::CNode>& arSelectors, bool bInser
if (nullptr == m_pWriter) if (nullptr == m_pWriter)
return false; return false;
const int nDepth{m_oLightReader.GetDepth()}; bool bResult{false};
bool bResult = false;
XmlUtils::XmlNodeType eNodeType = XmlUtils::XmlNodeType_EndElement;
while (m_oLightReader.Read(eNodeType) && m_oLightReader.GetDepth() >= nDepth && XmlUtils::XmlNodeType_EndElement != eNodeType) const int nDeath = m_oLightReader.GetDepth();
if(m_oLightReader.IsEmptyNode() || !m_oLightReader.ReadNextSiblingNode2(nDeath))
{ {
if (eNodeType == XmlUtils::XmlNodeType_Text || if (!bInsertEmptyP)
eNodeType == XmlUtils::XmlNodeType_Whitespace || return false;
eNodeType == XmlUtils::XmlNodeType_SIGNIFICANT_WHITESPACE ||
eNodeType == XmlUtils::XmlNodeType_CDATA)
{
const char* pValue = m_oLightReader.GetTextChar();
if('\0' != pValue[0]) m_pWriter->WriteEmptyParagraph();
{ return true;
std::wstring wsText;
NSFile::CUtf8Converter::GetUnicodeStringFromUTF8((BYTE*)pValue, (LONG)strlen(pValue), wsText);
if (wsText.empty())
continue;
arSelectors.push_back(NSCSS::CNode{L"#text", L"", L""});
m_oCSSCalculator.CalculateCompiledStyle(arSelectors);
bResult = m_pWriter->WriteText(wsText, arSelectors);
arSelectors.pop_back();
}
}
else if (eNodeType == XmlUtils::XmlNodeType_Element)
{
if (ReadInside(arSelectors))
bResult = true;
}
} }
do
{
if (ReadInside(arSelectors))
bResult = true;
} while(m_oLightReader.ReadNextSiblingNode2(nDeath));
if (!bResult && bInsertEmptyP) if (!bResult && bInsertEmptyP)
m_pWriter->WriteEmptyParagraph(); m_pWriter->WriteEmptyParagraph();
@ -603,10 +682,10 @@ bool CHTMLReader::ReadInside(std::vector<NSCSS::CNode>& arSelectors)
{ {
const std::wstring wsName{m_oLightReader.GetName()}; const std::wstring wsName{m_oLightReader.GetName()};
//TODO:: обработать все варианты return'а
if(wsName == L"#text") if(wsName == L"#text")
return ReadText(arSelectors); return ReadText(arSelectors);
//TODO:: обработать все варианты return'а
if (UnreadableNode(wsName) || TagIsUnprocessed(wsName)) if (UnreadableNode(wsName) || TagIsUnprocessed(wsName))
return false; return false;
@ -621,7 +700,7 @@ bool CHTMLReader::ReadInside(std::vector<NSCSS::CNode>& arSelectors)
case HTML_TAG(A): case HTML_TAG(A):
case HTML_TAG(AREA): case HTML_TAG(AREA):
{ {
bResult = ReadAnchor(arSelectors); bResult = ReadDefaultTag(HTML_TAG(A), arSelectors);
break; break;
} }
case HTML_TAG(ABBR): case HTML_TAG(ABBR):
@ -892,15 +971,6 @@ bool CHTMLReader::ReadInside(std::vector<NSCSS::CNode>& arSelectors)
} }
} }
// if (HTML_TAG(DIV) != eHtmlTag && HTML_TAG(ASIDE) != eHtmlTag)
// {
// if (bResult)
// m_oState.m_eLastElement = eHtmlTag;
// m_oState.m_bBanUpdatePageData = true;
// }
// readNote(oXml, sSelectors, sNote);
arSelectors.pop_back(); arSelectors.pop_back();
return bResult; return bResult;
} }
@ -919,25 +989,12 @@ bool CHTMLReader::ReadText(std::vector<NSCSS::CNode>& arSelectors)
return bResult; return bResult;
} }
bool CHTMLReader::ReadAnchor(std::vector<NSCSS::CNode>& arSelectors)
{
if (nullptr == m_pWriter || !m_mTags[HTML_TAG(A)]->Open(arSelectors))
return false;
if (!ReadStream(arSelectors))
m_pWriter->WriteEmptyParagraph(true);
m_mTags[HTML_TAG(A)]->Close(arSelectors);
return true;
}
bool CHTMLReader::ReadSVG(const std::vector<NSCSS::CNode>& arSelectors) bool CHTMLReader::ReadSVG(const std::vector<NSCSS::CNode>& arSelectors)
{ {
if (!m_mTags[HTML_TAG(IMAGE)]->Open(arSelectors, m_oLightReader.GetOuterXml())) if (!m_mTags[HTML_TAG(IMG)]->Open(arSelectors, m_oLightReader.GetOuterXml()))
return false; return false;
m_mTags[HTML_TAG(IMAGE)]->Close(arSelectors); m_mTags[HTML_TAG(IMG)]->Close(arSelectors);
return true; return true;
} }

View File

@ -37,20 +37,29 @@ public:
HRESULT ConvertHTML2OOXML (const std::wstring& wsPath, const std::wstring& wsDirectory, THTMLParameters* pParameters = nullptr); HRESULT ConvertHTML2OOXML (const std::wstring& wsPath, const std::wstring& wsDirectory, THTMLParameters* pParameters = nullptr);
HRESULT ConvertHTML2Markdown(const std::wstring& wsPath, const std::wstring& wsFinalFile, TMarkdownParameters* pParameters = nullptr); HRESULT ConvertHTML2Markdown(const std::wstring& wsPath, const std::wstring& wsFinalFile, TMarkdownParameters* pParameters = nullptr);
HRESULT ConvertMHT2OOXML (const std::wstring& sPath, const std::wstring& sDirectory, THTMLParameters* pParameters = nullptr); HRESULT ConvertHTML2OOXML (const std::vector<std::wstring>& arPaths, const std::wstring& wsDirectory, THTMLParameters* pParameters = nullptr);
HRESULT ConvertMHT2Markdown (const std::wstring& sPath, const std::wstring& sDirectory, TMarkdownParameters* pParameters = nullptr); HRESULT ConvertHTML2Markdown(const std::vector<std::wstring>& arPaths, const std::wstring& wsFinalFile, TMarkdownParameters* pParameters = nullptr);
HRESULT ConvertMHT2OOXML (const std::wstring& wsPath, const std::wstring& wsDirectory, THTMLParameters* pParameters = nullptr);
HRESULT ConvertMHT2Markdown (const std::wstring& wsPath, const std::wstring& wsFinalFile, TMarkdownParameters* pParameters = nullptr);
HRESULT ConvertMHT2OOXML (const std::vector<std::wstring>& arPaths, const std::wstring& wsDirectory, THTMLParameters* pParameters = nullptr);
HRESULT ConvertMHT2Markdown (const std::vector<std::wstring>& arPaths, const std::wstring& wsFinalFile, TMarkdownParameters* pParameters = nullptr);
NSCSS::CCssCalculator* GetCSSCalculator(); NSCSS::CCssCalculator* GetCSSCalculator();
private: private:
void Clear(); void Clear();
void InitOOXMLTags(THTMLParameters* pParametrs = nullptr); void InitOOXMLTags(THTMLParameters* pParametrs = nullptr);
void InitMDTags(); void InitMDTags(TMarkdownParameters* pParametrs = nullptr);
bool IsHTML(); bool IsHTML();
bool HTML2XHTML(const std::wstring& wsFileName); typedef std::function<bool(const std::wstring&, XmlUtils::CXmlLiteReader&)> Convert_Func;
HRESULT ConvertHTML(const std::wstring& wsPath, const std::wstring& wsDirectory); HRESULT InitAndConvert2OOXML(const std::vector<std::wstring>& arPaths, const std::wstring& wsDirectory, Convert_Func Convertation, THTMLParameters* pParameters = nullptr);
HRESULT InitAndConvert2Markdown(const std::vector<std::wstring>& arPaths, const std::wstring& wsFinalFile, Convert_Func Convertation, TMarkdownParameters* pParameters = nullptr);
bool Convert(const std::wstring& wsPath, Convert_Func Convertation);
void ReadStyle(); void ReadStyle();
void ReadStyle2(); void ReadStyle2();
@ -65,7 +74,6 @@ private:
bool ReadText(std::vector<NSCSS::CNode>& arSelectors); bool ReadText(std::vector<NSCSS::CNode>& arSelectors);
bool ReadAnchor(std::vector<NSCSS::CNode>& arSelectors);
bool ReadSVG(const std::vector<NSCSS::CNode>& arSelectors); bool ReadSVG(const std::vector<NSCSS::CNode>& arSelectors);
bool ReadEmptyTag(UINT unTag, const std::vector<NSCSS::CNode>& arSelectors); bool ReadEmptyTag(UINT unTag, const std::vector<NSCSS::CNode>& arSelectors);
bool ReadDefaultTag(UINT unTag, std::vector<NSCSS::CNode>& arSelectors); bool ReadDefaultTag(UINT unTag, std::vector<NSCSS::CNode>& arSelectors);

View File

@ -49,34 +49,10 @@ bool CAnchor<COOXMLWriter>::Open(const std::vector<NSCSS::CNode>& arSelectors, c
bCross = true; bCross = true;
if (arSelectors.back().GetAttributeValue(L"name", wsName)) if (arSelectors.back().GetAttributeValue(L"name", wsName))
m_pWriter->WriteBookmark(wsName); m_pWriter->WriteEmptyBookmark(wsName);
arSelectors.back().GetAttributeValue(L"alt", wsAlt); arSelectors.back().GetAttributeValue(L"alt", wsAlt);
if (!m_pWriter->OpenP())
m_pWriter->CloseR();
else
m_pWriter->WritePPr(arSelectors);
if (bCross)
m_pWriter->OpenCrossHyperlink(wsRef, arSelectors);
else
{
std::wstring wsTooltip(wsRef);
arSelectors.back().GetAttributeValue(L"title", wsTooltip);
m_pWriter->OpenExternalHyperlink(wsRef, wsTooltip, arSelectors);
}
return true;
}
void CAnchor<COOXMLWriter>::Close(const std::vector<NSCSS::CNode>& arSelectors)
{
if (!ValidWriter())
return;
bool bCross = false;
std::wstring wsFootnote; std::wstring wsFootnote;
if (arSelectors.back().m_wsStyle.find(L"mso-footnote-id") != std::wstring::npos) if (arSelectors.back().m_wsStyle.find(L"mso-footnote-id") != std::wstring::npos)
@ -87,23 +63,32 @@ void CAnchor<COOXMLWriter>::Close(const std::vector<NSCSS::CNode>& arSelectors)
wsFootnote = L"href"; wsFootnote = L"href";
} }
std::wstring wsRef; bool bFootnote = false;
if (arSelectors.size() > 1)
if (arSelectors.back().GetAttributeValue(L"href", wsRef))
{ {
if(wsRef.find('#') != std::wstring::npos) const NSCSS::CNode& oNode = arSelectors[arSelectors.size() - 2];
bCross = true; bFootnote = oNode.m_wsName == L"p" && oNode.m_wsClass == L"MsoFootnoteText";
} }
if (bCross) if (bCross)
{ m_pWriter->SetHyperlinkData(wsRef, L"", true, wsFootnote, bFootnote);
if (wsFootnote == L"href")
wsFootnote = wsRef.substr(wsRef.find('#') + 1);
m_pWriter->CloseCrossHyperlink(arSelectors, wsFootnote, wsRef);
}
else else
m_pWriter->CloseExternalHyperlink(); {
std::wstring wsTooltip(wsRef);
arSelectors.back().GetAttributeValue(L"title", wsTooltip);
m_pWriter->SetHyperlinkData(wsRef, wsTooltip, false, wsFootnote, bFootnote);
}
return true;
}
void CAnchor<COOXMLWriter>::Close(const std::vector<NSCSS::CNode>& arSelectors)
{
if (!ValidWriter())
return;
m_pWriter->ClearHyperlinkData();
} }
CAbbr<COOXMLWriter>::CAbbr(COOXMLWriter* pWriter) CAbbr<COOXMLWriter>::CAbbr(COOXMLWriter* pWriter)
@ -208,6 +193,7 @@ void CDivision<COOXMLWriter>::Close(const std::vector<NSCSS::CNode>& arSelectors
m_pWriter->RollBackState(); m_pWriter->RollBackState();
} }
m_pWriter->CloseP();
m_arFootnoteIDs.pop(); m_arFootnoteIDs.pop();
} }
@ -330,6 +316,12 @@ bool CImage<COOXMLWriter>::Open(const std::vector<NSCSS::CNode>& arSelectors, co
((!wsBasePath.empty() && wsBasePath.length() > 4 && wsBasePath.substr(0, 4) == L"http") || ((!wsBasePath.empty() && wsBasePath.length() > 4 && wsBasePath.substr(0, 4) == L"http") ||
(wsSrc.length() > 4 && wsSrc.substr(0, 4) == L"http"))) (wsSrc.length() > 4 && wsSrc.substr(0, 4) == L"http")))
{ {
if (!wsExtention.empty() && NotValidExtension(wsExtention))
{
m_pWriter->WriteAlternativeImage(wsAlt, wsSrc, oImageData);
return true;
}
const std::wstring wsDst = wsImagePath + L'.' + ((!wsExtention.empty()) ? wsExtention : L"png"); const std::wstring wsDst = wsImagePath + L'.' + ((!wsExtention.empty()) ? wsExtention : L"png");
// Проверка gc_allowNetworkRequest предполагается в kernel_network // Проверка gc_allowNetworkRequest предполагается в kernel_network
@ -404,11 +396,10 @@ bool CImage<COOXMLWriter>::Open(const std::vector<NSCSS::CNode>& arSelectors, co
m_pWriter->WriteAlternativeImage(wsAlt, wsSrc, oImageData); m_pWriter->WriteAlternativeImage(wsAlt, wsSrc, oImageData);
else else
{ {
m_arrImages.push_back(wsSrc);
m_pWriter->WritePPr(arSelectors); m_pWriter->WritePPr(arSelectors);
const std::wstring wsImageID{std::to_wstring(m_arrImages.size())}; const std::wstring wsImageID{std::to_wstring(m_arrImages.size())};
m_arrImages.push_back(wsSrc);
if (nImageId < 0) if (nImageId < 0)
{ {
@ -1256,6 +1247,7 @@ void CTable<COOXMLWriter>::Close(const std::vector<NSCSS::CNode>& arSelectors)
return; return;
m_pWriter->GetCurrentDocument()->WriteNodeEnd(L"w:tbl"); m_pWriter->GetCurrentDocument()->WriteNodeEnd(L"w:tbl");
m_pWriter->WriteEmptyParagraph(true);
} }
CTableRow<COOXMLWriter>::CTableRow(COOXMLWriter* pWriter) CTableRow<COOXMLWriter>::CTableRow(COOXMLWriter* pWriter)

View File

@ -19,6 +19,8 @@ public:
virtual void WriteEmptyParagraph(bool bVahish = false, bool bInP = false) = 0; virtual void WriteEmptyParagraph(bool bVahish = false, bool bInP = false) = 0;
virtual void PageBreak() = 0;
virtual void BeginBlock() = 0; virtual void BeginBlock() = 0;
virtual void EndBlock(bool bAddBlock) = 0; virtual void EndBlock(bool bAddBlock) = 0;

View File

@ -95,8 +95,10 @@ bool CMDWriter::WriteText(std::wstring wsText, const std::vector<NSCSS::CNode>&
} }
void CMDWriter::WriteEmptyParagraph(bool bVahish, bool bInP) void CMDWriter::WriteEmptyParagraph(bool bVahish, bool bInP)
{ {}
}
// Page-break hook of the Markdown writer: the body is empty, so nothing is written.
void CMDWriter::PageBreak()
{}
void CMDWriter::BeginBlock() void CMDWriter::BeginBlock()
{ {

View File

@ -43,6 +43,8 @@ public:
void WriteEmptyParagraph(bool bVahish = false, bool bInP = false) override; void WriteEmptyParagraph(bool bVahish = false, bool bInP = false) override;
void PageBreak() override;
void BeginBlock() override; void BeginBlock() override;
void EndBlock(bool bAddBlock) override; void EndBlock(bool bAddBlock) override;

View File

@ -40,11 +40,11 @@ inline UINT GetFontSizeLevel(UINT unFontSize);
inline UINT GetFontSizeByLevel(UINT unLevel); inline UINT GetFontSizeByLevel(UINT unLevel);
inline void ReplaceSpaces(std::wstring& wsValue); inline void ReplaceSpaces(std::wstring& wsValue);
COOXMLWriter::COOXMLWriter() COOXMLWriter::COOXMLWriter(THTMLParameters* pHTMLParameters, NSCSS::CCssCalculator* pCSSCalculator)
: m_pDstPath(nullptr), m_pTempDir(nullptr), m_pSrcPath(nullptr), : m_pDstPath(nullptr), m_pTempDir(nullptr), m_pSrcPath(nullptr),
m_pBasePath(nullptr), m_pCorePath(nullptr), m_pHTMLParameters(nullptr), m_pBasePath(nullptr), m_pCorePath(nullptr), m_pStylesCalculator(pCSSCalculator),
m_nFootnoteId(1), m_nHyperlinkId(1), m_nListId(1), m_nElementId(1), m_pHTMLParameters(pHTMLParameters), m_nFootnoteId(1), m_nHyperlinkId(1), m_nListId(1),
m_bBanUpdatePageData(false), m_bWasDivs(false), m_pFonts(nullptr) m_nElementId(1), m_bBanUpdatePageData(false), m_bWasDivs(false), m_pFonts(nullptr)
{ {
m_oPageData.SetWidth (DEFAULT_PAGE_WIDTH, NSCSS::UnitMeasure::Twips, 0, true); m_oPageData.SetWidth (DEFAULT_PAGE_WIDTH, NSCSS::UnitMeasure::Twips, 0, true);
m_oPageData.SetHeight(DEFAULT_PAGE_HEIGHT, NSCSS::UnitMeasure::Twips, 0, true); m_oPageData.SetHeight(DEFAULT_PAGE_HEIGHT, NSCSS::UnitMeasure::Twips, 0, true);
@ -56,16 +56,6 @@ COOXMLWriter::COOXMLWriter()
m_arStates.top().m_pCurrentDocument = &m_oDocXml; m_arStates.top().m_pCurrentDocument = &m_oDocXml;
} }
void COOXMLWriter::SetCSSCalculator(NSCSS::CCssCalculator* pCSSCalculator)
{
m_pStylesCalculator = pCSSCalculator;
}
void COOXMLWriter::SetHTMLParameters(THTMLParameters* pHTMLParameters)
{
m_pHTMLParameters = pHTMLParameters;
}
void COOXMLWriter::SetSrcDirectory(const std::wstring& wsPath) void COOXMLWriter::SetSrcDirectory(const std::wstring& wsPath)
{ {
m_pSrcPath = &wsPath; m_pSrcPath = &wsPath;
@ -401,6 +391,8 @@ bool COOXMLWriter::OpenR()
if (m_arStates.top().m_bInR) if (m_arStates.top().m_bInR)
return false; return false;
OpenHyperlink();
m_arStates.top().m_pCurrentDocument->WriteString(L"<w:r>"); m_arStates.top().m_pCurrentDocument->WriteString(L"<w:r>");
m_arStates.top().m_bInR = true; m_arStates.top().m_bInR = true;
return true; return true;
@ -416,6 +408,22 @@ bool COOXMLWriter::OpenT()
return true; return true;
} }
/**
 * Opens the hyperlink described by the current state, if there is one and it is not open yet.
 *
 * Does nothing when the state is already inside a hyperlink or holds no href. Otherwise a
 * cross or external hyperlink is opened, depending on m_bISCrossHyperlink, and the state
 * is flagged as being inside a hyperlink.
 */
void COOXMLWriter::OpenHyperlink()
{
	const auto& oState = m_arStates.top();

	if (oState.m_bInHyperlink || oState.m_wsHref.empty())
		return;

	if (oState.m_bISCrossHyperlink)
		OpenCrossHyperlink(oState.m_wsHref);
	else
		OpenExternalHyperlink(oState.m_wsHref, oState.m_wsTooltip);

	// Re-read the top state after the call instead of reusing the cached reference.
	m_arStates.top().m_bInHyperlink = true;
}
void COOXMLWriter::CloseP() void COOXMLWriter::CloseP()
{ {
m_arStates.top().m_bWasSpace = true; m_arStates.top().m_bWasSpace = true;
@ -425,6 +433,7 @@ void COOXMLWriter::CloseP()
CloseT(); CloseT();
CloseR(); CloseR();
CloseHyperlink();
m_arStates.top().m_pCurrentDocument->WriteString(L"</w:p>"); m_arStates.top().m_pCurrentDocument->WriteString(L"</w:p>");
m_arStates.top().m_bInP = false; m_arStates.top().m_bInP = false;
@ -448,6 +457,36 @@ void COOXMLWriter::CloseT()
m_arStates.top().m_bInT = false; m_arStates.top().m_bInT = false;
} }
// Closes the currently open <w:hyperlink> (no-op when none is open) and, when
// the state carries footnote data (m_wsFootnote), writes the footnote run
// right after the hyperlink:
//  - m_bIsFootnote == false: a <w:footnoteReference> with a freshly allocated
//    id, which is also registered in m_mFootnotes under m_wsFootnote;
//  - m_bIsFootnote == true:  a <w:footnoteRef/> marker.
//
// NOTE(review): SetHyperlinkData() calls this without closing the current run
// first - confirm callers never get here while <w:r> is still open.
void COOXMLWriter::CloseHyperlink()
{
	if (!m_arStates.top().m_bInHyperlink)
		return;

	m_arStates.top().m_pCurrentDocument->WriteString(L"</w:hyperlink>");
	m_arStates.top().m_bInHyperlink = false;

	// Footnote
	if (m_arStates.top().m_wsFootnote.empty())
		return;

	// OpenR() calls OpenHyperlink(). With m_wsHref still set it would open a new
	// <w:hyperlink> around the footnote run and leave it open, so the next
	// CloseHyperlink() would emit one more footnote reference. Hide the target
	// while the footnote run is being written and restore it afterwards.
	std::wstring wsSavedHref;
	wsSavedHref.swap(m_arStates.top().m_wsHref);

	if (!m_arStates.top().m_bIsFootnote)
	{
		const std::wstring sFootnoteID = std::to_wstring(m_nFootnoteId++);

		OpenR();
		m_arStates.top().m_pCurrentDocument->WriteString(L"<w:rPr><w:rStyle w:val=\"footnote\"/></w:rPr><w:footnoteReference w:id=\"");
		m_arStates.top().m_pCurrentDocument->WriteString(sFootnoteID);
		m_arStates.top().m_pCurrentDocument->WriteString(L"\"/>");
		CloseR();

		m_mFootnotes.insert(std::make_pair(m_arStates.top().m_wsFootnote, sFootnoteID));
	}
	else
	{
		OpenR();
		m_arStates.top().m_pCurrentDocument->WriteString(L"<w:rPr><w:rStyle w:val=\"footnote\"/></w:rPr><w:footnoteRef/>");
		CloseR();
	}

	m_arStates.top().m_wsHref.swap(wsSavedHref);
}
void COOXMLWriter::BeginBlock() void COOXMLWriter::BeginBlock()
{ {
CloseP(); CloseP();
@ -534,7 +573,35 @@ void COOXMLWriter::Break(const std::vector<NSCSS::CNode>& arSelectors)
m_arStates.top().m_bWasSpace = true; m_arStates.top().m_bWasSpace = true;
} }
void COOXMLWriter::OpenCrossHyperlink(const std::wstring& wsRef, const std::vector<NSCSS::CNode>& arSelectors) void COOXMLWriter::SetHyperlinkData(const std::wstring& wsRef, const std::wstring& wsTooltip, bool bIsCross, const std::wstring& wsFootnote, bool bIsFootnote)
{
CloseHyperlink();
//TODO:: think about a better way to handle hyperlinks
m_arStates.top().m_wsHref = wsRef;
m_arStates.top().m_wsTooltip = wsTooltip;
m_arStates.top().m_bISCrossHyperlink = bIsCross;
m_arStates.top().m_wsFootnote = wsFootnote;
m_arStates.top().m_bIsFootnote = bIsFootnote;
}
// Resets the hyperlink/footnote description stored in the current state:
// strings are emptied and both flags are set to false.
// m_bInHyperlink is not touched here.
void COOXMLWriter::ClearHyperlinkData()
{
	auto& oState = m_arStates.top();

	// link target
	oState.m_wsHref.clear();
	oState.m_wsTooltip.clear();
	oState.m_bISCrossHyperlink = false;

	// footnote
	oState.m_wsFootnote.clear();
	oState.m_bIsFootnote = false;
}
// Writes a paragraph whose only content is <w:pPr><w:pageBreakBefore/></w:pPr>
// into the current document: OpenP(), the properties string, then CloseP().
void COOXMLWriter::PageBreak()
{
	const std::wstring wsPageBreakProperties(L"<w:pPr><w:pageBreakBefore/></w:pPr>");

	OpenP();
	GetCurrentDocument()->WriteString(wsPageBreakProperties);
	CloseP();
}
void COOXMLWriter::OpenCrossHyperlink(const std::wstring& wsRef)
{ {
m_arStates.top().m_pCurrentDocument->WriteString(L"<w:hyperlink w:anchor=\""); m_arStates.top().m_pCurrentDocument->WriteString(L"<w:hyperlink w:anchor=\"");
const size_t nSharp = wsRef.find('#'); const size_t nSharp = wsRef.find('#');
@ -552,7 +619,7 @@ void COOXMLWriter::OpenCrossHyperlink(const std::wstring& wsRef, const std::vect
m_arStates.top().m_pCurrentDocument->WriteString(L"\">"); m_arStates.top().m_pCurrentDocument->WriteString(L"\">");
} }
void COOXMLWriter::OpenExternalHyperlink(const std::wstring& wsRef, const std::wstring& wsTooltip, const std::vector<NSCSS::CNode>& arSelectors) void COOXMLWriter::OpenExternalHyperlink(const std::wstring& wsRef, const std::wstring& wsTooltip)
{ {
XmlString& oRelationshipXml(m_oDocXmlRels); XmlString& oRelationshipXml(m_oDocXmlRels);
@ -647,7 +714,6 @@ std::wstring COOXMLWriter::WritePPr(const std::vector<NSCSS::CNode>& arSelectors
break; break;
} }
if (sPStyle.empty() && m_arDivId.empty() && wsAnchor.empty()) if (sPStyle.empty() && m_arDivId.empty() && wsAnchor.empty())
return L""; return L"";
@ -662,6 +728,7 @@ std::wstring COOXMLWriter::WritePPr(const std::vector<NSCSS::CNode>& arSelectors
int nLiLevel{-1}; int nLiLevel{-1};
bool bNumberingLi{false}; bool bNumberingLi{false};
bool bInTable{false};
for (const NSCSS::CNode& oNode : arSelectors) for (const NSCSS::CNode& oNode : arSelectors)
{ {
@ -669,6 +736,11 @@ std::wstring COOXMLWriter::WritePPr(const std::vector<NSCSS::CNode>& arSelectors
bNumberingLi = true; bNumberingLi = true;
else if (L"ul" == oNode.m_wsName) else if (L"ul" == oNode.m_wsName)
bNumberingLi = false; bNumberingLi = false;
else if (L"table" == oNode.m_wsName)
{
bInTable = true;
continue;
}
else else
continue; continue;
@ -679,20 +751,15 @@ std::wstring COOXMLWriter::WritePPr(const std::vector<NSCSS::CNode>& arSelectors
m_arStates.top().m_pCurrentDocument->WriteString(L"<w:numPr><w:ilvl w:val=\"" + std::to_wstring(nLiLevel) + L"\"/><w:numId w:val=\"" + m_arStates.top().m_pCurrentDocument->WriteString(L"<w:numPr><w:ilvl w:val=\"" + std::to_wstring(nLiLevel) + L"\"/><w:numId w:val=\"" +
(!bNumberingLi ? L"1" : std::to_wstring(m_nListId)) + L"\"/></w:numPr>"); (!bNumberingLi ? L"1" : std::to_wstring(m_nListId)) + L"\"/></w:numPr>");
if (!m_arDivId.empty()) if (!m_arDivId.empty() && !bInTable)
m_arStates.top().m_pCurrentDocument->WriteString(L"<w:divId w:val=\"" + m_arDivId.top() + L"\"/>"); m_arStates.top().m_pCurrentDocument->WriteString(L"<w:divId w:val=\"" + m_arDivId.top() + L"\"/>");
// m_pCurrentDocument->WriteString(oTS.sPStyle + sPSettings);
m_arStates.top().m_pCurrentDocument->WriteNodeEnd(L"w:pPr"); m_arStates.top().m_pCurrentDocument->WriteNodeEnd(L"w:pPr");
m_arStates.top().m_bWasPStyle = true; m_arStates.top().m_bWasPStyle = true;
if (!wsAnchor.empty()) if (!wsAnchor.empty())
{
// const anchors_map::const_iterator itAnchor{m_mAnchors.find(wsAnchor)};
// if (m_mAnchors.cend() != itAnchor)
WriteEmptyBookmark(wsAnchor); WriteEmptyBookmark(wsAnchor);
}
return sPStyle; return sPStyle;
} }
@ -797,7 +864,6 @@ bool COOXMLWriter::WriteText(std::wstring wsText, const std::vector<NSCSS::CNode
const bool bInT = m_arStates.top().m_bInT; const bool bInT = m_arStates.top().m_bInT;
//TODO:: open the paragraph (with its own styles) when the corresponding node is read, not when its text is read
OpenP(); OpenP();
WritePPr(arSelectors); WritePPr(arSelectors);

View File

@ -66,7 +66,13 @@ class COOXMLWriter : public IWriter
bool m_bInT; // <w:t> открыт? bool m_bInT; // <w:t> открыт?
bool m_bWasPStyle; // <w:pStyle> записан? bool m_bWasPStyle; // <w:pStyle> записан?
bool m_bWasSpace; // Был пробел? bool m_bWasSpace; // Был пробел?
bool m_bInHyperlink; // <w:hyperlink> открыт? bool m_bInHyperlink; // <w:hyperlink> открыт?
std::wstring m_wsTooltip;
std::wstring m_wsHref;
std::wstring m_wsFootnote;
bool m_bIsFootnote;
bool m_bISCrossHyperlink;
XmlString *m_pCurrentDocument; //Текущее место записи XmlString *m_pCurrentDocument; //Текущее место записи
bool m_bRemoveCurrentDocument; bool m_bRemoveCurrentDocument;
@ -74,6 +80,7 @@ class COOXMLWriter : public IWriter
TState(XmlString *pCurrentDocument) TState(XmlString *pCurrentDocument)
: m_bInP(false), m_bInR(false), m_bInT(false), : m_bInP(false), m_bInR(false), m_bInT(false),
m_bWasPStyle(false), m_bWasSpace(true), m_bInHyperlink(false), m_bWasPStyle(false), m_bWasSpace(true), m_bInHyperlink(false),
m_bIsFootnote(false), m_bISCrossHyperlink(false),
m_pCurrentDocument(pCurrentDocument), m_bRemoveCurrentDocument(false) m_pCurrentDocument(pCurrentDocument), m_bRemoveCurrentDocument(false)
{} {}
@ -109,10 +116,7 @@ class COOXMLWriter : public IWriter
NSFonts::IApplicationFonts* m_pFonts; // Необходимо для оптимизации работы со шрифтами NSFonts::IApplicationFonts* m_pFonts; // Необходимо для оптимизации работы со шрифтами
public: public:
COOXMLWriter(); COOXMLWriter(THTMLParameters* pHTMLParameters = nullptr, NSCSS::CCssCalculator* pCSSCalculator = nullptr);
void SetCSSCalculator(NSCSS::CCssCalculator* pCSSCalculator);
void SetHTMLParameters(THTMLParameters* pHTMLParameters);
void SetSrcDirectory (const std::wstring& wsPath); void SetSrcDirectory (const std::wstring& wsPath);
void SetDstDirectory (const std::wstring& wsPath); void SetDstDirectory (const std::wstring& wsPath);
@ -126,10 +130,14 @@ public:
bool OpenP(); bool OpenP();
bool OpenR(); bool OpenR();
bool OpenT(); bool OpenT();
void OpenHyperlink();
void CloseP(); void CloseP();
void CloseR(); void CloseR();
void CloseT(); void CloseT();
void CloseHyperlink();
void PageBreak() override;
void BeginBlock() override; void BeginBlock() override;
void EndBlock(bool bAddBlock) override; void EndBlock(bool bAddBlock) override;
@ -144,8 +152,11 @@ public:
void Break(const std::vector<NSCSS::CNode>& arSelectors); void Break(const std::vector<NSCSS::CNode>& arSelectors);
void OpenCrossHyperlink(const std::wstring& wsRef, const std::vector<NSCSS::CNode>& arSelectors); void SetHyperlinkData(const std::wstring& wsRef, const std::wstring& wsTooltip, bool bIsCross, const std::wstring& wsFootnote, bool bIsFootnote);
void OpenExternalHyperlink(const std::wstring& wsRef, const std::wstring& wsTooltip, const std::vector<NSCSS::CNode>& arSelectors); void ClearHyperlinkData();
void OpenCrossHyperlink(const std::wstring& wsRef);
void OpenExternalHyperlink(const std::wstring& wsRef, const std::wstring& wsTooltip);
void CloseCrossHyperlink(const std::vector<NSCSS::CNode>& arSelectors, std::wstring wsFootnote, const std::wstring& wsRef); void CloseCrossHyperlink(const std::vector<NSCSS::CNode>& arSelectors, std::wstring wsFootnote, const std::wstring& wsRef);
void CloseExternalHyperlink(); void CloseExternalHyperlink();

View File

@ -5106,15 +5106,15 @@ HRESULT CHtmlFile2::ConvertHTML2Markdown(const std::wstring& wsPath, const std::
#endif #endif
} }
HRESULT CHtmlFile2::OpenMht(const std::wstring& sSrc, const std::wstring& sDst) HRESULT CHtmlFile2::ConvertMHT2OOXML(const std::wstring& wsPath, const std::wstring& wsDirectory, HTML::THTMLParameters* pParametrs)
{ {
#ifdef USE_OLD_HTML_CONVERTER #ifdef USE_OLD_HTML_CONVERTER
if(!m_internal->m_oLightReader.IsValid()) if(!m_internal->m_oLightReader.IsValid())
if(!IsMhtFile(sSrc)) if(!IsMhtFile(wsPath))
return S_FALSE; return S_FALSE;
m_internal->m_sSrc = NSSystemPath::GetDirectoryName(sSrc); m_internal->m_sSrc = NSSystemPath::GetDirectoryName(wsPath);
m_internal->m_sDst = sDst; m_internal->m_sDst = sDirectory;
m_internal->CreateDocxEmpty(oParams); m_internal->CreateDocxEmpty(oParams);
m_internal->readStyle(); m_internal->readStyle();
@ -5128,22 +5128,37 @@ HRESULT CHtmlFile2::OpenMht(const std::wstring& sSrc, const std::wstring& sDst)
m_internal->write(); m_internal->write();
return S_OK; return S_OK;
#else #else
return S_FALSE; if (nullptr == m_pReader)
return S_FALSE;
return m_pReader->ConvertMHT2OOXML(wsPath, wsDirectory, pParametrs);
#endif #endif
} }
HRESULT CHtmlFile2::OpenBatchHtml(const std::vector<std::wstring>& sSrc, const std::wstring& sDst) HRESULT CHtmlFile2::ConvertMHT2Markdown(const std::wstring& wsPath, const std::wstring& wsFinalFile, HTML::TMarkdownParameters* pParametrs)
{ {
#ifdef USE_OLD_HTML_CONVERTER #ifdef USE_OLD_HTML_CONVERTER
m_internal->m_sDst = sDst; return S_FALSE;
#else
if (nullptr == m_pReader)
return S_FALSE;
return m_pReader->ConvertMHT2Markdown(wsPath, wsFinalFile, pParametrs);
#endif
}
HRESULT CHtmlFile2::ConvertHTML2OOXML(const std::vector<std::wstring>& arPaths, const std::wstring& wsDirectory, HTML::THTMLParameters* pParametrs)
{
#ifdef USE_OLD_HTML_CONVERTER
m_internal->m_sDst = wsDirectory;
m_internal->CreateDocxEmpty(oParams); m_internal->CreateDocxEmpty(oParams);
bool bFirst = true; bool bFirst = true;
for(const std::wstring& sS : sSrc) for(const std::wstring& sS : arPaths)
{ {
#ifdef _DEBUG #ifdef _DEBUG
std::wcout << NSFile::GetFileName(sS) << std::endl; std::wcout << NSFile::GetFileName(sS) << std::endl;
#endif #endif
m_internal->m_sSrc = NSSystemPath::GetDirectoryName(sS); m_internal->m_sSrc = NSSystemPath::GetDirectoryName(sS);
if(!IsHtmlFile(sS)) if(!IsHtmlFile(sS))
@ -5171,8 +5186,48 @@ HRESULT CHtmlFile2::OpenBatchHtml(const std::vector<std::wstring>& sSrc, const s
m_internal->write(); m_internal->write();
return S_OK; return S_OK;
#else
if (nullptr == m_pReader)
return S_FALSE;
return m_pReader->ConvertHTML2OOXML(arPaths, wsDirectory, pParametrs);
#endif #endif
}
// Batch overload: takes a list of input paths (arPaths) and one output path (wsFinalFile).
// When USE_OLD_HTML_CONVERTER is defined the function only returns S_FALSE.
// Otherwise it returns S_FALSE if m_pReader is null, and else forwards all three
// arguments to m_pReader->ConvertHTML2Markdown and returns its result.
HRESULT CHtmlFile2::ConvertHTML2Markdown(const std::vector<std::wstring>& arPaths, const std::wstring& wsFinalFile, HTML::TMarkdownParameters* pParametrs)
{
#ifdef USE_OLD_HTML_CONVERTER
return S_FALSE; return S_FALSE;
#else
// No reader available - nothing can be converted.
if (nullptr == m_pReader)
return S_FALSE;
return m_pReader->ConvertHTML2Markdown(arPaths, wsFinalFile, pParametrs);
#endif
}
// Batch overload: takes a list of input paths (arPaths) and an output directory (wsDirectory).
// When USE_OLD_HTML_CONVERTER is defined the function only returns S_FALSE.
// Otherwise the call is forwarded to m_pReader->ConvertMHT2OOXML, or S_FALSE is
// returned when m_pReader is null.
HRESULT CHtmlFile2::ConvertMHT2OOXML(const std::vector<std::wstring>& arPaths, const std::wstring& wsDirectory, HTML::THTMLParameters* pParametrs)
{
#ifdef USE_OLD_HTML_CONVERTER
	return S_FALSE;
#else
	if (nullptr != m_pReader)
		return m_pReader->ConvertMHT2OOXML(arPaths, wsDirectory, pParametrs);

	// No reader available - nothing can be converted.
	return S_FALSE;
#endif
}
// Batch overload: takes a list of input paths (arPaths) and one output path (wsFinalFile).
// When USE_OLD_HTML_CONVERTER is defined the function only returns S_FALSE.
// Otherwise it returns S_FALSE if m_pReader is null, and else forwards all three
// arguments to m_pReader->ConvertMHT2Markdown and returns its result.
HRESULT CHtmlFile2::ConvertMHT2Markdown(const std::vector<std::wstring>& arPaths, const std::wstring& wsFinalFile, HTML::TMarkdownParameters* pParametrs)
{
#ifdef USE_OLD_HTML_CONVERTER
return S_FALSE;
#else
// No reader available - nothing can be converted.
if (nullptr == m_pReader)
return S_FALSE;
return m_pReader->ConvertMHT2Markdown(arPaths, wsFinalFile, pParametrs);
#endif
} }
#ifdef USE_OLD_HTML_CONVERTER #ifdef USE_OLD_HTML_CONVERTER

View File

@ -44,17 +44,14 @@ public:
HRESULT ConvertHTML2OOXML (const std::wstring& wsPath, const std::wstring& wsDirectory, HTML::THTMLParameters* pParametrs = nullptr); HRESULT ConvertHTML2OOXML (const std::wstring& wsPath, const std::wstring& wsDirectory, HTML::THTMLParameters* pParametrs = nullptr);
HRESULT ConvertHTML2Markdown(const std::wstring& wsPath, const std::wstring& wsFinalFile, HTML::TMarkdownParameters* pParametrs = nullptr); HRESULT ConvertHTML2Markdown(const std::wstring& wsPath, const std::wstring& wsFinalFile, HTML::TMarkdownParameters* pParametrs = nullptr);
HRESULT ConvertMHT2OOXML (const std::wstring& sPath, const std::wstring& sDirectory, HTML::THTMLParameters* pParametrs = nullptr); HRESULT ConvertMHT2OOXML (const std::wstring& wsPath, const std::wstring& wsDirectory, HTML::THTMLParameters* pParametrs = nullptr);
HRESULT ConvertMHT2Markdown (const std::wstring& sPath, const std::wstring& sDirectory, HTML::TMarkdownParameters* pParametrs = nullptr); HRESULT ConvertMHT2Markdown (const std::wstring& wsPath, const std::wstring& wsFinalFile, HTML::TMarkdownParameters* pParametrs = nullptr);
HRESULT ConvertHTML2OOXML (const std::vector<std::wstring>& arPaths, const std::wstring& wsDirectory, HTML::THTMLParameters* pParametrs = nullptr); HRESULT ConvertHTML2OOXML (const std::vector<std::wstring>& arPaths, const std::wstring& wsDirectory, HTML::THTMLParameters* pParametrs = nullptr);
HRESULT ConvertHTML2Markdown(const std::vector<std::wstring>& arPaths, const std::wstring& wsDirectory, HTML::TMarkdownParameters* pParametrs = nullptr); HRESULT ConvertHTML2Markdown(const std::vector<std::wstring>& arPaths, const std::wstring& wsFinalFile, HTML::TMarkdownParameters* pParametrs = nullptr);
HRESULT ConvertMHT2OOXML (const std::vector<std::wstring>& arPaths, const std::wstring& wsDirectory, HTML::THTMLParameters* pParametrs = nullptr); HRESULT ConvertMHT2OOXML (const std::vector<std::wstring>& arPaths, const std::wstring& wsDirectory, HTML::THTMLParameters* pParametrs = nullptr);
HRESULT ConvertMHT2Markdown (const std::vector<std::wstring>& arPaths, const std::wstring& wsDirectory, HTML::TMarkdownParameters* pParametrs = nullptr); HRESULT ConvertMHT2Markdown (const std::vector<std::wstring>& arPaths, const std::wstring& wsFinalFile, HTML::TMarkdownParameters* pParametrs = nullptr);
HRESULT OpenMht (const std::wstring& sPath, const std::wstring& sDirectory);
HRESULT OpenBatchHtml(const std::vector<std::wstring>& sPath, const std::wstring& sDirectory);
}; };
#endif // _HTMLFILE2_HTMLFILE2_H #endif // _HTMLFILE2_HTMLFILE2_H

View File

@ -198,6 +198,8 @@ namespace NExtractTools
DECLARE_CONVERT_FUNC(doct_bin2html); DECLARE_CONVERT_FUNC(doct_bin2html);
DECLARE_CONVERT_FUNC(doct_bin2html_zip); DECLARE_CONVERT_FUNC(doct_bin2html_zip);
DECLARE_CONVERT_FUNC(html2md);
// mht // mht
DECLARE_CONVERT_FUNC(mht2docx_dir); DECLARE_CONVERT_FUNC(mht2docx_dir);

View File

@ -83,7 +83,7 @@ namespace NExtractTools
params.m_bMacro = false; params.m_bMacro = false;
CHtmlFile2 oFile; CHtmlFile2 oFile;
oFile.SetTmpDirectory(convertParams.m_sTempDir); oFile.SetTempDirectory(convertParams.m_sTempDir);
return (S_OK == oFile.OpenBatchHtml(arFiles, sTo)) ? 0 : AVS_FILEUTILS_ERROR_CONVERT; return (S_OK == oFile.OpenBatchHtml(arFiles, sTo)) ? 0 : AVS_FILEUTILS_ERROR_CONVERT;
} }
_UINT32 html2docx_dir(const std::wstring& sFrom, const std::wstring& sTo, InputParams& params, ConvertParams& convertParams) _UINT32 html2docx_dir(const std::wstring& sFrom, const std::wstring& sTo, InputParams& params, ConvertParams& convertParams)
@ -129,7 +129,7 @@ namespace NExtractTools
// Converts the MHT file sFrom into an unpacked docx directory sTo.
// Returns 0 when the converter reports S_OK and AVS_FILEUTILS_ERROR_CONVERT otherwise.
// params is not used here; convertParams only supplies the temp directory.
_UINT32 mht2docx_dir(const std::wstring& sFrom, const std::wstring& sTo, InputParams& params, ConvertParams& convertParams)
{
	CHtmlFile2 oFile;
	oFile.SetTempDirectory(convertParams.m_sTempDir);

	// CHtmlFile2::OpenMht was removed from the class; its replacement is
	// ConvertMHT2OOXML (the parameters argument keeps its nullptr default).
	return (S_OK == oFile.ConvertMHT2OOXML(sFrom, sTo)) ? 0 : AVS_FILEUTILS_ERROR_CONVERT;
}
// epub // epub
@ -258,4 +258,12 @@ namespace NExtractTools
{ {
return Md::ConvertMdFileToHtml(sFrom, sTo) ? S_OK : AVS_FILEUTILS_ERROR_CONVERT; return Md::ConvertMdFileToHtml(sFrom, sTo) ? S_OK : AVS_FILEUTILS_ERROR_CONVERT;
} }
// html -> markdown
// Converts the HTML file sFrom into the Markdown file sTo.
// Returns 0 when the converter reports S_OK and AVS_FILEUTILS_ERROR_CONVERT otherwise.
// params is not used here; convertParams only supplies the temp directory.
_UINT32 html2md(const std::wstring& sFrom, const std::wstring& sTo, InputParams& params, ConvertParams& convertParams)
{
	CHtmlFile2 oHtmlFile;
	oHtmlFile.SetTempDirectory(convertParams.m_sTempDir);

	if (S_OK != oHtmlFile.ConvertHTML2Markdown(sFrom, sTo))
		return AVS_FILEUTILS_ERROR_CONVERT;

	return 0;
}
} }