mirror of
https://github.com/ONLYOFFICE/core.git
synced 2026-04-07 13:55:33 +08:00
673 lines
20 KiB
C++
673 lines
20 KiB
C++
#ifndef HTMLTOXHTML_H
|
||
#define HTMLTOXHTML_H
|
||
|
||
#include <string>
|
||
#include <map>
|
||
#include <cctype>
|
||
#include <vector>
|
||
#include <algorithm>
|
||
|
||
#include "gumbo-parser/src/gumbo.h"
|
||
#include "../../../DesktopEditor/common/File.h"
|
||
#include "../../../DesktopEditor/common/Directory.h"
|
||
#include "../../../DesktopEditor/common/StringBuilder.h"
|
||
#include "../../../UnicodeConverter/UnicodeConverter.h"
|
||
|
||
static std::string nonbreaking_inline = "|a|abbr|acronym|b|bdo|big|cite|code|dfn|em|font|i|img|kbd|nobr|s|small|span|strike|strong|sub|sup|tt|";
|
||
static std::string empty_tags = "|area|base|basefont|bgsound|br|command|col|embed|event-source|frame|hr|image|img|input|keygen|link|menuitem|meta|param|source|spacer|track|wbr|";
|
||
static std::string preserve_whitespace = "|pre|textarea|script|style|";
|
||
static std::string special_handling = "|html|body|";
|
||
static std::string no_entity_sub = ""; //"|style|";
|
||
static std::string treat_like_inline = "|p|";
|
||
|
||
static void prettyprint(GumboNode*, NSStringUtils::CStringBuilderA& oBuilder);
|
||
static std::string mhtTohtml(std::string& sFileContent);
|
||
|
||
// Заменяет в строке s все символы s1 на s2
|
||
static void replace_all(std::string& s, const std::string& s1, const std::string& s2)
|
||
{
|
||
size_t pos = s.find(s1);
|
||
while(pos != std::string::npos)
|
||
{
|
||
s.replace(pos, s1.length(), s2);
|
||
pos = s.find(s1, pos + s2.length());
|
||
}
|
||
}
|
||
|
||
static std::wstring htmlToXhtml(std::string& sFileContent, bool bNeedConvert)
|
||
{
|
||
// Распознование кодировки
|
||
if (bNeedConvert)
|
||
{
|
||
size_t posEncoding = sFileContent.find("charset=");
|
||
if (posEncoding == std::string::npos)
|
||
posEncoding = sFileContent.find("encoding=");
|
||
if (posEncoding != std::string::npos)
|
||
{
|
||
posEncoding = sFileContent.find("=", posEncoding) + 1;
|
||
char quoteSymbol = '\"';
|
||
if(sFileContent[posEncoding] == '\"' || sFileContent[posEncoding] == '\'')
|
||
{
|
||
quoteSymbol = sFileContent[posEncoding];
|
||
posEncoding += 1;
|
||
}
|
||
|
||
size_t posEnd = sFileContent.find(quoteSymbol, posEncoding);
|
||
if (std::string::npos != posEnd)
|
||
{
|
||
std::string sEncoding = sFileContent.substr(posEncoding, posEnd - posEncoding);
|
||
if (sEncoding != "utf-8" && sEncoding != "UTF-8")
|
||
{
|
||
NSUnicodeConverter::CUnicodeConverter oConverter;
|
||
sFileContent = U_TO_UTF8(oConverter.toUnicode(sFileContent, sEncoding.c_str()));
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
// Избавление от <a/>
|
||
size_t posA = sFileContent.find("<a ");
|
||
while(posA != std::string::npos)
|
||
{
|
||
size_t nBegin = sFileContent.find('<', posA + 1);
|
||
size_t nEnd = sFileContent.find("/>", posA);
|
||
if(nEnd < nBegin)
|
||
sFileContent.replace(nEnd, 2, "></a>");
|
||
posA = sFileContent.find("<a ", nBegin);
|
||
}
|
||
// Избавление от <title/>
|
||
posA = sFileContent.find("<title/>");
|
||
while (posA != std::string::npos)
|
||
{
|
||
sFileContent.replace(posA, 8, "<title></title>");
|
||
posA = sFileContent.find("<title/>", posA);
|
||
}
|
||
// Избавление от <script/>
|
||
posA = sFileContent.find("<script");
|
||
while (posA != std::string::npos)
|
||
{
|
||
size_t nEnd = 0;
|
||
size_t nEnd1 = sFileContent.find("/>", posA);
|
||
size_t nEnd2 = sFileContent.find("</script>", posA);
|
||
if (nEnd1 != std::string::npos)
|
||
nEnd = nEnd1 + 2;
|
||
if (nEnd2 != std::string::npos && (nEnd == 0 || (nEnd > 0 && nEnd2 < nEnd)))
|
||
nEnd = nEnd2 + 9;
|
||
|
||
sFileContent.erase(posA, nEnd - posA);
|
||
|
||
posA = sFileContent.find("<script", posA);
|
||
}
|
||
|
||
// Gumbo
|
||
GumboOptions options = kGumboDefaultOptions;
|
||
GumboOutput* output = gumbo_parse_with_options(&options, sFileContent.data(), sFileContent.length());
|
||
|
||
// prettyprint
|
||
NSStringUtils::CStringBuilderA oBuilder;
|
||
prettyprint(output->document, oBuilder);
|
||
|
||
// Конвертирование из string utf8 в wstring
|
||
return UTF8_TO_U(oBuilder.GetData());
|
||
}
|
||
|
||
static std::string Base64ToString(const std::string& sContent, const std::string& sCharset)
|
||
{
|
||
std::string sRes;
|
||
int nSrcLen = (int)sContent.length();
|
||
int nDecodeLen = NSBase64::Base64DecodeGetRequiredLength(nSrcLen);
|
||
BYTE* pData = new BYTE[nDecodeLen];
|
||
if (TRUE == NSBase64::Base64Decode(sContent.c_str(), nSrcLen, pData, &nDecodeLen))
|
||
{
|
||
std::wstring sConvert;
|
||
if(!sCharset.empty() && sCharset != "utf-8" && sCharset != "UTF-8")
|
||
{
|
||
NSUnicodeConverter::CUnicodeConverter oConverter;
|
||
sConvert = oConverter.toUnicode(reinterpret_cast<char *>(pData), (unsigned)nDecodeLen, sCharset.data());
|
||
}
|
||
sRes = sConvert.empty() ? std::string(reinterpret_cast<char *>(pData), nDecodeLen) : U_TO_UTF8(sConvert);
|
||
}
|
||
RELEASEARRAYOBJECTS(pData);
|
||
return sRes;
|
||
}
|
||
|
||
static std::string QuotedPrintableDecode(const std::string& sContent, std::string& sCharset)
|
||
{
|
||
NSStringUtils::CStringBuilderA sRes;
|
||
size_t ip = 0;
|
||
size_t i = sContent.find('=');
|
||
|
||
if(i == 0)
|
||
{
|
||
size_t nIgnore = 12;
|
||
std::string charset = sContent.substr(0, nIgnore);
|
||
if(charset == "=00=00=FE=FF")
|
||
sCharset = "UTF-32BE";
|
||
else if(charset == "=FF=FE=00=00")
|
||
sCharset = "UTF-32LE";
|
||
else if(charset == "=2B=2F=76=38" || charset == "=2B=2F=76=39" ||
|
||
charset == "=2B=2F=76=2B" || charset == "=2B=2F=76=2F")
|
||
sCharset = "UTF-7";
|
||
else if(charset == "=DD=73=66=73")
|
||
sCharset = "UTF-EBCDIC";
|
||
else if(charset == "=84=31=95=33")
|
||
sCharset = "GB-18030";
|
||
else
|
||
{
|
||
nIgnore -= 3;
|
||
charset.erase(nIgnore);
|
||
if(charset == "=EF=BB=BF")
|
||
sCharset = "UTF-8";
|
||
else if(charset == "=F7=64=4C")
|
||
sCharset = "UTF-1";
|
||
else if(charset == "=0E=FE=FF")
|
||
sCharset = "SCSU";
|
||
else if(charset == "=FB=EE=28")
|
||
sCharset = "BOCU-1";
|
||
else
|
||
{
|
||
nIgnore -= 3;
|
||
charset.erase(nIgnore);
|
||
if(charset == "=FE=FF")
|
||
sCharset = "UTF-16BE";
|
||
else if(charset == "=FF=FE")
|
||
sCharset = "UTF-16LE";
|
||
else
|
||
nIgnore -= 6;
|
||
}
|
||
}
|
||
|
||
ip = nIgnore;
|
||
i = sContent.find('=', ip);
|
||
}
|
||
|
||
while(i != std::string::npos && i + 2 < sContent.length())
|
||
{
|
||
sRes.WriteString(sContent.c_str() + ip, i - ip);
|
||
std::string str = sContent.substr(i + 1, 2);
|
||
if(str.front() == '\n' || str.front() == '\r')
|
||
{
|
||
char ch = str[1];
|
||
if(ch != '\n' && ch != '\r')
|
||
sRes.WriteString(&ch, 1);
|
||
}
|
||
else
|
||
{
|
||
char* err;
|
||
char ch = (int)strtol(str.data(), &err, 16);
|
||
if(*err)
|
||
sRes.WriteString('=' + str);
|
||
else
|
||
sRes.WriteString(&ch, 1);
|
||
}
|
||
ip = i + 3;
|
||
i = sContent.find('=', ip);
|
||
}
|
||
if(ip != std::string::npos)
|
||
sRes.WriteString(sContent.c_str() + ip);
|
||
return sRes.GetData();
|
||
}
|
||
|
||
static void ReadMht(std::string& sFileContent, size_t& nFound, size_t& nNextFound, const std::string& sBoundary,
|
||
std::map<std::string, std::string>& sRes, NSStringUtils::CStringBuilderA& oRes)
|
||
{
|
||
// Content
|
||
size_t nContentTag = sFileContent.find("\n\n", nFound);
|
||
if(nContentTag == std::string::npos || nContentTag > nNextFound)
|
||
{
|
||
nContentTag = sFileContent.find("\r\r", nFound);
|
||
if(nContentTag == std::string::npos || nContentTag > nNextFound)
|
||
{
|
||
nContentTag = sFileContent.find("\r\n\r\n", nFound);
|
||
if(nContentTag == std::string::npos || nContentTag > nNextFound)
|
||
{
|
||
nFound = nNextFound;
|
||
return;
|
||
}
|
||
else
|
||
nContentTag += 4;
|
||
}
|
||
else
|
||
nContentTag += 2;
|
||
}
|
||
else
|
||
nContentTag += 2;
|
||
|
||
// Content-Type
|
||
size_t nTag = sFileContent.find("Content-Type: ", nFound);
|
||
if(nTag == std::string::npos || nTag > nContentTag)
|
||
{
|
||
nFound = nNextFound;
|
||
return;
|
||
}
|
||
size_t nTagEnd = sFileContent.find_first_of(";\n\r", nTag);
|
||
nTag += 14;
|
||
if(nTagEnd == std::string::npos || nTagEnd > nContentTag)
|
||
{
|
||
nFound = nNextFound;
|
||
return;
|
||
}
|
||
std::string sContentType = sFileContent.substr(nTag, nTagEnd - nTag);
|
||
if(sContentType == "multipart/alternative")
|
||
nContentTag = nFound;
|
||
|
||
// name
|
||
std::string sName;
|
||
nTag = sFileContent.find(" name=", nFound);
|
||
if(nTag != std::string::npos && nTag < nContentTag)
|
||
{
|
||
nTagEnd = sFileContent.find_first_of(";\n\r", nTag);
|
||
nTag += 6;
|
||
if(nTagEnd != std::string::npos && nTagEnd < nContentTag)
|
||
sName = sFileContent.substr(nTag, nTagEnd - nTag);
|
||
}
|
||
|
||
// charset
|
||
std::string sCharset;
|
||
nTag = sFileContent.find("charset=", nFound);
|
||
if(nTag != std::string::npos && nTag < nContentTag)
|
||
{
|
||
nTagEnd = sFileContent.find_first_of(";\n\r", nTag);
|
||
nTag += 8;
|
||
if(nTagEnd != std::string::npos && nTagEnd < nContentTag)
|
||
{
|
||
if(sFileContent[nTag] == '\"')
|
||
{
|
||
nTag++;
|
||
nTagEnd--;
|
||
}
|
||
sCharset = sFileContent.substr(nTag, nTagEnd - nTag);
|
||
}
|
||
}
|
||
|
||
// Content-Location
|
||
std::string sContentLocation;
|
||
nTag = sFileContent.find("Content-Location: ", nFound);
|
||
if(nTag != std::string::npos && nTag < nContentTag)
|
||
{
|
||
nTagEnd = sFileContent.find_first_of(";\n\r", nTag);
|
||
nTag += 18;
|
||
if(nTagEnd != std::string::npos && nTagEnd < nContentTag)
|
||
sContentLocation = sFileContent.substr(nTag, nTagEnd - nTag);
|
||
}
|
||
|
||
if (sContentLocation.empty())
|
||
{
|
||
// Content-ID
|
||
std::string sContentID;
|
||
nTag = sFileContent.find("Content-ID: <", nFound);
|
||
if(nTag != std::string::npos && nTag < nContentTag)
|
||
{
|
||
nTagEnd = sFileContent.find_first_of(">", nTag);
|
||
nTag += 13;
|
||
if(nTagEnd != std::string::npos && nTagEnd < nContentTag)
|
||
sContentID = sFileContent.substr(nTag, nTagEnd - nTag);
|
||
}
|
||
|
||
if (!sContentID.empty())
|
||
sContentLocation = "cid:" + sContentID;
|
||
}
|
||
|
||
// Content-Transfer-Encoding
|
||
std::string sContentEncoding;
|
||
nTag = sFileContent.find("Content-Transfer-Encoding: ", nFound);
|
||
if(nTag != std::string::npos && nTag < nContentTag)
|
||
{
|
||
nTagEnd = sFileContent.find_first_of(";\n\r", nTag);
|
||
nTag += 27;
|
||
if(nTagEnd != std::string::npos && nTagEnd < nContentTag)
|
||
sContentEncoding = sFileContent.substr(nTag, nTagEnd - nTag);
|
||
}
|
||
|
||
// Content
|
||
nTagEnd = nNextFound - 2;
|
||
if(nTagEnd == std::string::npos || nTagEnd < nContentTag)
|
||
{
|
||
nFound = nNextFound;
|
||
return;
|
||
}
|
||
std::string sContent = sFileContent.substr(nContentTag, nTagEnd - nContentTag);
|
||
|
||
// Удаляем лишнее
|
||
sFileContent.erase(0, nNextFound);
|
||
nFound = sFileContent.find(sBoundary);
|
||
|
||
std::wstring sExtention = NSFile::GetFileExtention(UTF8_TO_U(sName));
|
||
std::transform(sExtention.begin(), sExtention.end(), sExtention.begin(), tolower);
|
||
// Основной документ
|
||
if(sContentType == "multipart/alternative")
|
||
oRes.WriteString(mhtTohtml(sContent));
|
||
else if((sContentType.find("text") != std::string::npos && (sExtention.empty() || sExtention == L"htm" || sExtention == L"html" || sExtention
|
||
== L"xhtml" || sExtention == L"css")) || (sContentType == "application/octet-stream" && (sContentLocation.find("css") !=
|
||
std::string::npos)))
|
||
{
|
||
// Стили заключаются в тэг <style>
|
||
if(sContentType == "text/css" || sExtention == L"css" || sContentLocation.find("css") != std::string::npos)
|
||
oRes.WriteString("<style>");
|
||
if(sContentEncoding == "Base64" || sContentEncoding == "base64")
|
||
oRes.WriteString(Base64ToString(sContent, sCharset));
|
||
else if(sContentEncoding == "8bit" || sContentEncoding == "7bit" || sContentEncoding.empty())
|
||
{
|
||
if (sCharset != "utf-8" && sCharset != "UTF-8" && !sCharset.empty())
|
||
{
|
||
NSUnicodeConverter::CUnicodeConverter oConverter;
|
||
sContent = U_TO_UTF8(oConverter.toUnicode(sContent, sCharset.data()));
|
||
}
|
||
oRes.WriteString(sContent);
|
||
}
|
||
else if(sContentEncoding == "quoted-printable" || sContentEncoding == "Quoted-Printable")
|
||
{
|
||
sContent = QuotedPrintableDecode(sContent, sCharset);
|
||
if (sCharset != "utf-8" && sCharset != "UTF-8" && !sCharset.empty())
|
||
{
|
||
NSUnicodeConverter::CUnicodeConverter oConverter;
|
||
sContent = U_TO_UTF8(oConverter.toUnicode(sContent, sCharset.data()));
|
||
}
|
||
oRes.WriteString(sContent);
|
||
}
|
||
if(sContentType == "text/css" || sExtention == L"css" || sContentLocation.find("css") != std::string::npos)
|
||
oRes.WriteString("</style>");
|
||
}
|
||
// Картинки
|
||
else if((sContentType.find("image") != std::string::npos || sExtention == L"gif" || sContentType == "application/octet-stream") &&
|
||
(sContentEncoding == "Base64" || sContentEncoding == "base64"))
|
||
{
|
||
if(sExtention == L"ico" || sContentType.find("ico") != std::string::npos)
|
||
sContentType = "image/jpg";
|
||
else if(sExtention == L"gif")
|
||
sContentType = "image/gif";
|
||
int nSrcLen = (int)sContent.length();
|
||
int nDecodeLen = NSBase64::Base64DecodeGetRequiredLength(nSrcLen);
|
||
BYTE* pData = new BYTE[nDecodeLen];
|
||
if (TRUE == NSBase64::Base64Decode(sContent.c_str(), nSrcLen, pData, &nDecodeLen))
|
||
sRes.insert(std::make_pair(sContentLocation, "data:" + sContentType + ";base64," + sContent));
|
||
RELEASEARRAYOBJECTS(pData);
|
||
}
|
||
}
|
||
|
||
static std::string mhtTohtml(std::string& sFileContent)
|
||
{
|
||
std::map<std::string, std::string> sRes;
|
||
NSStringUtils::CStringBuilderA oRes;
|
||
|
||
// Поиск boundary
|
||
size_t nFound = sFileContent.find("boundary=");
|
||
if(nFound == std::string::npos)
|
||
{
|
||
size_t nFoundEnd = sFileContent.length();
|
||
nFound = 0;
|
||
ReadMht(sFileContent, nFound, nFoundEnd, "no", sRes, oRes);
|
||
return oRes.GetData();
|
||
}
|
||
size_t nFoundEnd = sFileContent.find_first_of(";\n\r", nFound);
|
||
if(nFoundEnd == std::string::npos)
|
||
return "";
|
||
nFound += 9;
|
||
if(sFileContent[nFound] == '\"')
|
||
{
|
||
nFound++;
|
||
nFoundEnd--;
|
||
}
|
||
if(nFound > nFoundEnd)
|
||
return "";
|
||
std::string sBoundary = sFileContent.substr(nFound, nFoundEnd - nFound);
|
||
size_t nBoundaryLength = sBoundary.length();
|
||
|
||
// Удаляем лишнее
|
||
nFound = sFileContent.find(sBoundary, nFoundEnd);
|
||
sFileContent.erase(0, nFound);
|
||
|
||
// Цикл по boundary
|
||
nFound = 0;
|
||
while(nFound != std::string::npos)
|
||
{
|
||
// Выход по --boundary--
|
||
if(sFileContent[nFound + nBoundaryLength + 1] == '-')
|
||
break;
|
||
nFoundEnd = sFileContent.find(sBoundary, nFound + nBoundaryLength);
|
||
if(nFoundEnd == std::string::npos)
|
||
break;
|
||
ReadMht(sFileContent, nFound, nFoundEnd, sBoundary, sRes, oRes);
|
||
}
|
||
std::string sFile = oRes.GetData();
|
||
for(const std::pair<std::string, std::string>& item : sRes)
|
||
{
|
||
std::string sName = item.first;
|
||
size_t found = sFile.find(sName);
|
||
size_t sfound = sName.rfind('/');
|
||
if(found == std::string::npos && sfound != std::string::npos)
|
||
found = sFile.find(sName.erase(0, sfound + 1));
|
||
while(found != std::string::npos)
|
||
{
|
||
size_t fq = sFile.find_last_of("\"\'>=", found);
|
||
char ch = sFile[fq];
|
||
if(ch != '\"' && ch != '\'')
|
||
fq++;
|
||
size_t tq = sFile.find_first_of("\"\'<> ", found) + 1;
|
||
if(sFile[tq] != '\"' && sFile[tq] != '\'')
|
||
tq--;
|
||
if(ch != '>')
|
||
{
|
||
std::string is = '\"' + item.second + '\"';
|
||
sFile.replace(fq, tq - fq, is);
|
||
found = sFile.find(sName, fq + is.length());
|
||
}
|
||
else
|
||
found = sFile.find(sName, tq);
|
||
}
|
||
}
|
||
return sFile;
|
||
}
|
||
|
||
static std::wstring mhtToXhtml(std::string& sFileContent)
|
||
{
|
||
sFileContent = mhtTohtml(sFileContent);
|
||
|
||
// Gumbo
|
||
GumboOptions options = kGumboDefaultOptions;
|
||
GumboOutput* output = gumbo_parse_with_options(&options, sFileContent.data(), sFileContent.length());
|
||
|
||
// prettyprint
|
||
NSStringUtils::CStringBuilderA oBuilder;
|
||
prettyprint(output->document, oBuilder);
|
||
|
||
// Конвертирование из string utf8 в wstring
|
||
return UTF8_TO_U(oBuilder.GetData());
|
||
}
|
||
|
||
// Заменяет сущности &,<,> в text
|
||
static void substitute_xml_entities_into_text(std::string& text)
|
||
{
|
||
// replacing & must come first
|
||
replace_all(text, "&", "&");
|
||
replace_all(text, "<", "<");
|
||
replace_all(text, ">", ">");
|
||
}
|
||
|
||
// Заменяет сущности " в text
|
||
static void substitute_xml_entities_into_attributes(std::string& text)
|
||
{
|
||
substitute_xml_entities_into_text(text);
|
||
replace_all(text, "\"", """);
|
||
}
|
||
|
||
static std::string handle_unknown_tag(GumboStringPiece* text)
|
||
{
|
||
if (text->data == NULL)
|
||
return "";
|
||
GumboStringPiece gsp = *text;
|
||
gumbo_tag_from_original_text(&gsp);
|
||
std::string sAtr = std::string(gsp.data, gsp.length);
|
||
size_t found = sAtr.find_first_of("-'+,./=?;!*#@$_%<>&;\"\'()[]{}");
|
||
while(found != std::string::npos)
|
||
{
|
||
sAtr.erase(found, 1);
|
||
found = sAtr.find_first_of("-'+,./=?;!*#@$_%<>&;\"\'()[]{}", found);
|
||
}
|
||
return sAtr;
|
||
}
|
||
|
||
static std::string get_tag_name(GumboNode* node)
|
||
{
|
||
std::string tagname = (node->type == GUMBO_NODE_DOCUMENT ? "document" : gumbo_normalized_tagname(node->v.element.tag));
|
||
if (tagname.empty())
|
||
tagname = handle_unknown_tag(&node->v.element.original_tag);
|
||
return tagname;
|
||
}
|
||
|
||
static void build_doctype(GumboNode* node, NSStringUtils::CStringBuilderA& oBuilder)
|
||
{
|
||
if (node->v.document.has_doctype)
|
||
{
|
||
oBuilder.WriteString("<!DOCTYPE ");
|
||
oBuilder.WriteString(node->v.document.name);
|
||
std::string pi(node->v.document.public_identifier);
|
||
if ((node->v.document.public_identifier != NULL) && !pi.empty())
|
||
{
|
||
oBuilder.WriteString(" PUBLIC \"");
|
||
oBuilder.WriteString(pi);
|
||
oBuilder.WriteString("\" \"");
|
||
oBuilder.WriteString(node->v.document.system_identifier);
|
||
oBuilder.WriteString("\"");
|
||
}
|
||
oBuilder.WriteString(">");
|
||
}
|
||
}
|
||
|
||
static void build_attributes(const GumboVector* attribs, bool no_entities, NSStringUtils::CStringBuilderA& atts)
|
||
{
|
||
std::vector<std::string> arrRepeat;
|
||
for (size_t i = 0; i < attribs->length; ++i)
|
||
{
|
||
GumboAttribute* at = static_cast<GumboAttribute*>(attribs->data[i]);
|
||
std::string sVal(at->value);
|
||
std::string sName(at->name);
|
||
atts.WriteString(" ");
|
||
|
||
bool bCheck = false;
|
||
size_t nBad = sName.find_first_of("+,.=?#%<>&;\"\'()[]{}");
|
||
while(nBad != std::string::npos)
|
||
{
|
||
sName.erase(nBad, 1);
|
||
nBad = sName.find_first_of("+,.=?#%<>&;\"\'()[]{}", nBad);
|
||
if(sName.empty())
|
||
break;
|
||
bCheck = true;
|
||
}
|
||
if(sName.empty())
|
||
continue;
|
||
while(sName.front() >= '0' && sName.front() <= '9')
|
||
{
|
||
sName.erase(0, 1);
|
||
if(sName.empty())
|
||
break;
|
||
bCheck = true;
|
||
}
|
||
if(bCheck)
|
||
{
|
||
GumboAttribute* check = gumbo_get_attribute(attribs, sName.c_str());
|
||
if(check || std::find(arrRepeat.begin(), arrRepeat.end(), sName) != arrRepeat.end())
|
||
continue;
|
||
else
|
||
arrRepeat.push_back(sName);
|
||
}
|
||
|
||
if(sName.empty())
|
||
continue;
|
||
atts.WriteString(sName);
|
||
|
||
// determine original quote character used if it exists
|
||
std::string qs ="\"";
|
||
atts.WriteString("=");
|
||
atts.WriteString(qs);
|
||
if(!no_entities)
|
||
substitute_xml_entities_into_attributes(sVal);
|
||
atts.WriteString(sVal);
|
||
atts.WriteString(qs);
|
||
}
|
||
}
|
||
|
||
static void prettyprint_contents(GumboNode* node, NSStringUtils::CStringBuilderA& contents)
|
||
{
|
||
std::string key = "|" + get_tag_name(node) + "|";
|
||
bool no_entity_substitution = no_entity_sub.find(key) != std::string::npos;
|
||
bool keep_whitespace = preserve_whitespace.find(key) != std::string::npos;
|
||
bool is_inline = nonbreaking_inline.find(key) != std::string::npos;
|
||
bool is_like_inline = treat_like_inline.find(key) != std::string::npos;
|
||
|
||
GumboVector* children = &node->v.element.children;
|
||
|
||
for (size_t i = 0; i < children->length; i++)
|
||
{
|
||
GumboNode* child = static_cast<GumboNode*> (children->data[i]);
|
||
|
||
if (child->type == GUMBO_NODE_TEXT)
|
||
{
|
||
std::string val(child->v.text.text);
|
||
if(!no_entity_substitution)
|
||
substitute_xml_entities_into_text(val);
|
||
|
||
// Избавление от FF
|
||
size_t found = val.find_first_of("\014");
|
||
while(found != std::string::npos)
|
||
{
|
||
val.erase(found, 1);
|
||
found = val.find_first_of("\014", found);
|
||
}
|
||
|
||
contents.WriteString(val);
|
||
}
|
||
else if ((child->type == GUMBO_NODE_ELEMENT) || (child->type == GUMBO_NODE_TEMPLATE))
|
||
prettyprint(child, contents);
|
||
else if (child->type == GUMBO_NODE_WHITESPACE)
|
||
{
|
||
if (keep_whitespace || is_inline || is_like_inline)
|
||
contents.WriteString(child->v.text.text);
|
||
}
|
||
else if (child->type != GUMBO_NODE_COMMENT)
|
||
{
|
||
// Сообщение об ошибке
|
||
// Does this actually exist: (child->type == GUMBO_NODE_CDATA)
|
||
// fprintf(stderr, "unknown element of type: %d\n", child->type);
|
||
}
|
||
}
|
||
}
|
||
|
||
static void prettyprint(GumboNode* node, NSStringUtils::CStringBuilderA& oBuilder)
|
||
{
|
||
// special case the document node
|
||
if (node->type == GUMBO_NODE_DOCUMENT)
|
||
{
|
||
build_doctype(node, oBuilder);
|
||
prettyprint_contents(node, oBuilder);
|
||
return;
|
||
}
|
||
|
||
std::string close = "";
|
||
std::string closeTag = "";
|
||
std::string tagname = get_tag_name(node);
|
||
std::string key = "|" + tagname + "|";
|
||
bool is_empty_tag = empty_tags.find(key) != std::string::npos;
|
||
bool no_entity_substitution = no_entity_sub.find(key) != std::string::npos;
|
||
|
||
// determine closing tag type
|
||
if (is_empty_tag)
|
||
close = "/";
|
||
else
|
||
closeTag = "</" + tagname + ">";
|
||
|
||
// build results
|
||
oBuilder.WriteString("<" + tagname);
|
||
|
||
// build attr string
|
||
const GumboVector* attribs = &node->v.element.attributes;
|
||
build_attributes(attribs, no_entity_substitution, oBuilder);
|
||
oBuilder.WriteString(close + ">");
|
||
|
||
// prettyprint your contents
|
||
prettyprint_contents(node, oBuilder);
|
||
oBuilder.WriteString(closeTag);
|
||
}
|
||
|
||
#endif // HTMLTOXHTML_H
|