Files
core/Common/3dParty/html/htmltoxhtml.h
Svetlana Kulikova 4c47e0e6b0 Fox bug 64115
Fix empty file because of script tag
2023-09-12 17:57:08 +03:00

673 lines
20 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#ifndef HTMLTOXHTML_H
#define HTMLTOXHTML_H
#include <string>
#include <map>
#include <cctype>
#include <vector>
#include <algorithm>
#include "gumbo-parser/src/gumbo.h"
#include "../../../DesktopEditor/common/File.h"
#include "../../../DesktopEditor/common/Directory.h"
#include "../../../DesktopEditor/common/StringBuilder.h"
#include "../../../UnicodeConverter/UnicodeConverter.h"
// Tag-name lookup tables. Every name is wrapped in '|' so that a key of the
// form "|name|" can be matched with std::string::find.
// Tags whose whitespace-only child nodes are kept by prettyprint_contents().
static std::string nonbreaking_inline = "|a|abbr|acronym|b|bdo|big|cite|code|dfn|em|font|i|img|kbd|nobr|s|small|span|strike|strong|sub|sup|tt|";
// Tags that prettyprint() opens as "<tag ... />" and never writes a closing tag for.
static std::string empty_tags = "|area|base|basefont|bgsound|br|command|col|embed|event-source|frame|hr|image|img|input|keygen|link|menuitem|meta|param|source|spacer|track|wbr|";
// Tags whose whitespace-only child nodes are likewise kept by prettyprint_contents().
static std::string preserve_whitespace = "|pre|textarea|script|style|";
// NOTE(review): not referenced by any function in this part of the file.
static std::string special_handling = "|html|body|";
// Tags whose text and attribute values are written without XML entity
// substitution. Currently empty; "|style|" is kept only as a commented-out value.
static std::string no_entity_sub = ""; //"|style|";
// Tags treated like the inline ones when deciding whether to keep whitespace-only children.
static std::string treat_like_inline = "|p|";
// Forward declarations: prettyprint() and prettyprint_contents() call each
// other, and ReadMht() calls mhtTohtml() for nested multipart content.
static void prettyprint(GumboNode*, NSStringUtils::CStringBuilderA& oBuilder);
static std::string mhtTohtml(std::string& sFileContent);
// Replaces every occurrence of the substring s1 in s with s2.
// The search resumes right after each inserted s2, so text introduced by a
// replacement is never rescanned (s2 may safely contain s1).
// An empty s1 leaves s untouched: std::string::find("") matches at every
// position, which would otherwise keep the loop below running forever.
static void replace_all(std::string& s, const std::string& s1, const std::string& s2)
{
	if (s1.empty())
		return;
	for (size_t pos = s.find(s1); pos != std::string::npos; pos = s.find(s1, pos + s2.length()))
		s.replace(pos, s1.length(), s2);
}
static std::wstring htmlToXhtml(std::string& sFileContent, bool bNeedConvert)
{
// Распознование кодировки
if (bNeedConvert)
{
size_t posEncoding = sFileContent.find("charset=");
if (posEncoding == std::string::npos)
posEncoding = sFileContent.find("encoding=");
if (posEncoding != std::string::npos)
{
posEncoding = sFileContent.find("=", posEncoding) + 1;
char quoteSymbol = '\"';
if(sFileContent[posEncoding] == '\"' || sFileContent[posEncoding] == '\'')
{
quoteSymbol = sFileContent[posEncoding];
posEncoding += 1;
}
size_t posEnd = sFileContent.find(quoteSymbol, posEncoding);
if (std::string::npos != posEnd)
{
std::string sEncoding = sFileContent.substr(posEncoding, posEnd - posEncoding);
if (sEncoding != "utf-8" && sEncoding != "UTF-8")
{
NSUnicodeConverter::CUnicodeConverter oConverter;
sFileContent = U_TO_UTF8(oConverter.toUnicode(sFileContent, sEncoding.c_str()));
}
}
}
}
// Избавление от <a/>
size_t posA = sFileContent.find("<a ");
while(posA != std::string::npos)
{
size_t nBegin = sFileContent.find('<', posA + 1);
size_t nEnd = sFileContent.find("/>", posA);
if(nEnd < nBegin)
sFileContent.replace(nEnd, 2, "></a>");
posA = sFileContent.find("<a ", nBegin);
}
// Избавление от <title/>
posA = sFileContent.find("<title/>");
while (posA != std::string::npos)
{
sFileContent.replace(posA, 8, "<title></title>");
posA = sFileContent.find("<title/>", posA);
}
// Избавление от <script/>
posA = sFileContent.find("<script");
while (posA != std::string::npos)
{
size_t nEnd = 0;
size_t nEnd1 = sFileContent.find("/>", posA);
size_t nEnd2 = sFileContent.find("</script>", posA);
if (nEnd1 != std::string::npos)
nEnd = nEnd1 + 2;
if (nEnd2 != std::string::npos && (nEnd == 0 || (nEnd > 0 && nEnd2 < nEnd)))
nEnd = nEnd2 + 9;
sFileContent.erase(posA, nEnd - posA);
posA = sFileContent.find("<script", posA);
}
// Gumbo
GumboOptions options = kGumboDefaultOptions;
GumboOutput* output = gumbo_parse_with_options(&options, sFileContent.data(), sFileContent.length());
// prettyprint
NSStringUtils::CStringBuilderA oBuilder;
prettyprint(output->document, oBuilder);
// Конвертирование из string utf8 в wstring
return UTF8_TO_U(oBuilder.GetData());
}
static std::string Base64ToString(const std::string& sContent, const std::string& sCharset)
{
std::string sRes;
int nSrcLen = (int)sContent.length();
int nDecodeLen = NSBase64::Base64DecodeGetRequiredLength(nSrcLen);
BYTE* pData = new BYTE[nDecodeLen];
if (TRUE == NSBase64::Base64Decode(sContent.c_str(), nSrcLen, pData, &nDecodeLen))
{
std::wstring sConvert;
if(!sCharset.empty() && sCharset != "utf-8" && sCharset != "UTF-8")
{
NSUnicodeConverter::CUnicodeConverter oConverter;
sConvert = oConverter.toUnicode(reinterpret_cast<char *>(pData), (unsigned)nDecodeLen, sCharset.data());
}
sRes = sConvert.empty() ? std::string(reinterpret_cast<char *>(pData), nDecodeLen) : U_TO_UTF8(sConvert);
}
RELEASEARRAYOBJECTS(pData);
return sRes;
}
// Decodes quoted-printable text.
//   sContent - encoded input.
//   sCharset - in/out: overwritten with the charset implied by an encoded
//              byte-order mark at the very start of sContent (e.g. "=EF=BB=BF"
//              -> "UTF-8"); left untouched when no known mark is present.
//              A recognised mark is not copied to the output.
// Returns the decoded bytes. "=XY" sequences that are not valid hexadecimal
// are copied through unchanged.
static std::string QuotedPrintableDecode(const std::string& sContent, std::string& sCharset)
{
	std::string sRes;
	sRes.reserve(sContent.length());

	size_t ip = 0;                      // start of the not yet copied literal run
	size_t i = sContent.find('=');      // position of the next '='
	if (i == 0)
	{
		// Encoded byte-order marks, longest first so that e.g. the UTF-32LE
		// mark is not mistaken for the UTF-16LE one it starts with.
		struct TEncodedBom
		{
			const char* sBytes;
			size_t      nLength;
			const char* sName;
		};
		static const TEncodedBom arBoms[] =
		{
			{"=00=00=FE=FF", 12, "UTF-32BE"},
			{"=FF=FE=00=00", 12, "UTF-32LE"},
			{"=2B=2F=76=38", 12, "UTF-7"},
			{"=2B=2F=76=39", 12, "UTF-7"},
			{"=2B=2F=76=2B", 12, "UTF-7"},
			{"=2B=2F=76=2F", 12, "UTF-7"},
			{"=DD=73=66=73", 12, "UTF-EBCDIC"},
			{"=84=31=95=33", 12, "GB-18030"},
			{"=EF=BB=BF",     9, "UTF-8"},
			{"=F7=64=4C",     9, "UTF-1"},
			{"=0E=FE=FF",     9, "SCSU"},
			{"=FB=EE=28",     9, "BOCU-1"},
			{"=FE=FF",        6, "UTF-16BE"},
			{"=FF=FE",        6, "UTF-16LE"}
		};
		for (const TEncodedBom& oBom : arBoms)
		{
			// compare() clamps to the available characters, so an input
			// shorter than the mark simply does not match (no exception).
			if (sContent.compare(0, oBom.nLength, oBom.sBytes) == 0)
			{
				sCharset = oBom.sName;
				ip = oBom.nLength;
				break;
			}
		}
		i = sContent.find('=', ip);
	}

	const size_t nLength = sContent.length();
	while (i != std::string::npos && i + 2 < nLength)
	{
		// Literal text preceding the '='
		sRes.append(sContent, ip, i - ip);

		const char chNext = sContent[i + 1];
		if (chNext == '\r' || chNext == '\n')
		{
			// Soft line break: drop '=' together with exactly one line
			// terminator (CRLF, CR or LF) so that an encoded character at the
			// start of the next line is still decoded.
			ip = i + 2;
			if (chNext == '\r' && sContent[ip] == '\n')
				++ip;
		}
		else
		{
			const std::string sHex = sContent.substr(i + 1, 2);
			char* pEnd = NULL;
			const char ch = (char)strtol(sHex.c_str(), &pEnd, 16);
			if (*pEnd)
			{
				// Not a complete hex pair: keep the sequence as it is
				sRes += '=';
				sRes += sHex;
			}
			else
				sRes += ch;
			ip = i + 3;
		}
		i = sContent.find('=', ip);
	}

	// Remaining tail after the last processed '='
	sRes.append(sContent, ip, std::string::npos);
	return sRes;
}
// Processes one MIME part of an MHT document.
//   sFileContent - the document; on success everything before nNextFound is erased.
//   nFound       - in: offset where the part starts; out: offset of the next
//                  boundary (after erasing) or nNextFound when the part is skipped.
//   nNextFound   - offset of the boundary that ends this part (the caller passes
//                  the content length when the document has no boundary).
//   sBoundary    - boundary string used to locate the following part.
//   sRes         - receives images: key is Content-Location (or "cid:" + Content-ID),
//                  value is a "data:<type>;base64,<data>" URI.
//   oRes         - receives the decoded text of HTML/CSS parts.
// A part without a blank line after its headers, without a Content-Type header,
// or whose body would end before it starts is skipped.
static void ReadMht(std::string& sFileContent, size_t& nFound, size_t& nNextFound, const std::string& sBoundary,
std::map<std::string, std::string>& sRes, NSStringUtils::CStringBuilderA& oRes)
{
// Start of the body: the first blank line inside the part, trying the
// separators "\n\n", "\r\r" and "\r\n\r\n" in that order.
// nContentTag ends up pointing just past the separator.
size_t nContentTag = sFileContent.find("\n\n", nFound);
if(nContentTag == std::string::npos || nContentTag > nNextFound)
{
nContentTag = sFileContent.find("\r\r", nFound);
if(nContentTag == std::string::npos || nContentTag > nNextFound)
{
nContentTag = sFileContent.find("\r\n\r\n", nFound);
if(nContentTag == std::string::npos || nContentTag > nNextFound)
{
// No header/body separator in this part: skip it
nFound = nNextFound;
return;
}
else
nContentTag += 4;
}
else
nContentTag += 2;
}
else
nContentTag += 2;
// Content-Type (mandatory; must appear inside the header block)
size_t nTag = sFileContent.find("Content-Type: ", nFound);
if(nTag == std::string::npos || nTag > nContentTag)
{
nFound = nNextFound;
return;
}
// The value runs up to the first ';' or line break; 14 is the length of "Content-Type: "
size_t nTagEnd = sFileContent.find_first_of(";\n\r", nTag);
nTag += 14;
if(nTagEnd == std::string::npos || nTagEnd > nContentTag)
{
nFound = nNextFound;
return;
}
std::string sContentType = sFileContent.substr(nTag, nTagEnd - nTag);
// For multipart/alternative the header limit is moved back to the start of
// the part, so none of the header lookups below can match and the body
// extracted later starts at nFound (it is passed to mhtTohtml as a whole).
if(sContentType == "multipart/alternative")
nContentTag = nFound;
// name parameter (optional); 6 is the length of " name="
std::string sName;
nTag = sFileContent.find(" name=", nFound);
if(nTag != std::string::npos && nTag < nContentTag)
{
nTagEnd = sFileContent.find_first_of(";\n\r", nTag);
nTag += 6;
if(nTagEnd != std::string::npos && nTagEnd < nContentTag)
sName = sFileContent.substr(nTag, nTagEnd - nTag);
}
// charset parameter (optional); 8 is the length of "charset="
std::string sCharset;
nTag = sFileContent.find("charset=", nFound);
if(nTag != std::string::npos && nTag < nContentTag)
{
nTagEnd = sFileContent.find_first_of(";\n\r", nTag);
nTag += 8;
if(nTagEnd != std::string::npos && nTagEnd < nContentTag)
{
// A value opening with a double quote loses one character at each end
if(sFileContent[nTag] == '\"')
{
nTag++;
nTagEnd--;
}
sCharset = sFileContent.substr(nTag, nTagEnd - nTag);
}
}
// Content-Location (optional); 18 is the length of "Content-Location: "
std::string sContentLocation;
nTag = sFileContent.find("Content-Location: ", nFound);
if(nTag != std::string::npos && nTag < nContentTag)
{
nTagEnd = sFileContent.find_first_of(";\n\r", nTag);
nTag += 18;
if(nTagEnd != std::string::npos && nTagEnd < nContentTag)
sContentLocation = sFileContent.substr(nTag, nTagEnd - nTag);
}
if (sContentLocation.empty())
{
// Content-ID: used as "cid:<id>" when there is no Content-Location;
// 13 is the length of "Content-ID: <"
std::string sContentID;
nTag = sFileContent.find("Content-ID: <", nFound);
if(nTag != std::string::npos && nTag < nContentTag)
{
nTagEnd = sFileContent.find_first_of(">", nTag);
nTag += 13;
if(nTagEnd != std::string::npos && nTagEnd < nContentTag)
sContentID = sFileContent.substr(nTag, nTagEnd - nTag);
}
if (!sContentID.empty())
sContentLocation = "cid:" + sContentID;
}
// Content-Transfer-Encoding (optional); 27 is the length of "Content-Transfer-Encoding: "
std::string sContentEncoding;
nTag = sFileContent.find("Content-Transfer-Encoding: ", nFound);
if(nTag != std::string::npos && nTag < nContentTag)
{
nTagEnd = sFileContent.find_first_of(";\n\r", nTag);
nTag += 27;
if(nTagEnd != std::string::npos && nTagEnd < nContentTag)
sContentEncoding = sFileContent.substr(nTag, nTagEnd - nTag);
}
// Body: from nContentTag up to two characters before the next boundary
// (presumably to leave out the "--" that precedes the boundary string - TODO confirm)
nTagEnd = nNextFound - 2;
if(nTagEnd == std::string::npos || nTagEnd < nContentTag)
{
nFound = nNextFound;
return;
}
std::string sContent = sFileContent.substr(nContentTag, nTagEnd - nContentTag);
// Drop the processed part; offsets into sFileContent taken above are stale from here on
sFileContent.erase(0, nNextFound);
nFound = sFileContent.find(sBoundary);
// Lower-cased result of NSFile::GetFileExtention for the name parameter
std::wstring sExtention = NSFile::GetFileExtention(UTF8_TO_U(sName));
std::transform(sExtention.begin(), sExtention.end(), sExtention.begin(), tolower);
// Main document: a nested multipart/alternative body is converted recursively
if(sContentType == "multipart/alternative")
oRes.WriteString(mhtTohtml(sContent));
// Text parts (no extension, htm, html, xhtml or css) and octet-stream parts
// whose Content-Location contains "css"
else if((sContentType.find("text") != std::string::npos && (sExtention.empty() || sExtention == L"htm" || sExtention == L"html" || sExtention
== L"xhtml" || sExtention == L"css")) || (sContentType == "application/octet-stream" && (sContentLocation.find("css") !=
std::string::npos)))
{
// Styles are wrapped in a <style> tag
if(sContentType == "text/css" || sExtention == L"css" || sContentLocation.find("css") != std::string::npos)
oRes.WriteString("<style>");
// Decode according to the transfer encoding; an unrecognised encoding writes nothing
if(sContentEncoding == "Base64" || sContentEncoding == "base64")
oRes.WriteString(Base64ToString(sContent, sCharset));
else if(sContentEncoding == "8bit" || sContentEncoding == "7bit" || sContentEncoding.empty())
{
// Convert to UTF-8 unless the charset is missing or already utf-8/UTF-8
if (sCharset != "utf-8" && sCharset != "UTF-8" && !sCharset.empty())
{
NSUnicodeConverter::CUnicodeConverter oConverter;
sContent = U_TO_UTF8(oConverter.toUnicode(sContent, sCharset.data()));
}
oRes.WriteString(sContent);
}
else if(sContentEncoding == "quoted-printable" || sContentEncoding == "Quoted-Printable")
{
// QuotedPrintableDecode may replace sCharset when the body starts with an encoded byte-order mark
sContent = QuotedPrintableDecode(sContent, sCharset);
if (sCharset != "utf-8" && sCharset != "UTF-8" && !sCharset.empty())
{
NSUnicodeConverter::CUnicodeConverter oConverter;
sContent = U_TO_UTF8(oConverter.toUnicode(sContent, sCharset.data()));
}
oRes.WriteString(sContent);
}
if(sContentType == "text/css" || sExtention == L"css" || sContentLocation.find("css") != std::string::npos)
oRes.WriteString("</style>");
}
// Images: Base64-encoded image/*, *.gif or application/octet-stream parts
else if((sContentType.find("image") != std::string::npos || sExtention == L"gif" || sContentType == "application/octet-stream") &&
(sContentEncoding == "Base64" || sContentEncoding == "base64"))
{
// Icons are labelled image/jpg and *.gif files image/gif in the data URI
if(sExtention == L"ico" || sContentType.find("ico") != std::string::npos)
sContentType = "image/jpg";
else if(sExtention == L"gif")
sContentType = "image/gif";
// The decode serves only as a validity check: the data URI stores the
// original Base64 text, the decoded bytes are discarded.
int nSrcLen = (int)sContent.length();
int nDecodeLen = NSBase64::Base64DecodeGetRequiredLength(nSrcLen);
BYTE* pData = new BYTE[nDecodeLen];
if (TRUE == NSBase64::Base64Decode(sContent.c_str(), nSrcLen, pData, &nDecodeLen))
sRes.insert(std::make_pair(sContentLocation, "data:" + sContentType + ";base64," + sContent));
RELEASEARRAYOBJECTS(pData);
}
}
// Converts an MHT (MIME HTML) document to a single HTML string.
//   sFileContent - the MHT text; consumed while it is processed (parts are
//                  erased from it as they are read).
// Returns the concatenated HTML/CSS parts with references to embedded images
// replaced by "data:" URIs, or an empty string when the boundary declaration
// is malformed or empty.
static std::string mhtTohtml(std::string& sFileContent)
{
	std::map<std::string, std::string> sRes;   // image location -> data URI
	NSStringUtils::CStringBuilderA oRes;       // decoded textual parts

	// Locate the boundary declaration
	size_t nFound = sFileContent.find("boundary=");
	if (nFound == std::string::npos)
	{
		// No boundary: treat the whole content as a single part
		size_t nFoundEnd = sFileContent.length();
		nFound = 0;
		ReadMht(sFileContent, nFound, nFoundEnd, "no", sRes, oRes);
		return oRes.GetData();
	}

	size_t nFoundEnd = sFileContent.find_first_of(";\n\r", nFound);
	if (nFoundEnd == std::string::npos)
		return "";
	nFound += 9;   // length of "boundary="
	if (sFileContent[nFound] == '\"')
	{
		nFound++;
		nFoundEnd--;
	}
	// An empty boundary is rejected as well: it would match at every position
	// and the part loop below would never advance.
	if (nFound >= nFoundEnd)
		return "";

	const std::string sBoundary = sFileContent.substr(nFound, nFoundEnd - nFound);
	const size_t nBoundaryLength = sBoundary.length();

	// Drop everything before the first use of the boundary
	// (the whole content when the boundary never occurs again)
	nFound = sFileContent.find(sBoundary, nFoundEnd);
	sFileContent.erase(0, nFound);

	// Walk the parts
	nFound = 0;
	while (nFound != std::string::npos)
	{
		// Stop at the closing delimiter "--boundary--"; the index is checked
		// first because the remaining content may be shorter than that.
		const size_t nTerminator = nFound + nBoundaryLength + 1;
		if (nTerminator < sFileContent.length() && sFileContent[nTerminator] == '-')
			break;
		nFoundEnd = sFileContent.find(sBoundary, nFound + nBoundaryLength);
		if (nFoundEnd == std::string::npos)
			break;
		ReadMht(sFileContent, nFound, nFoundEnd, sBoundary, sRes, oRes);
	}

	// Substitute image references with their data URIs
	std::string sFile = oRes.GetData();
	for (const std::map<std::string, std::string>::value_type& item : sRes)
	{
		std::string sName = item.first;
		// A part stored without any location has an empty key, which would
		// match at every position and overwrite unrelated attribute values.
		if (sName.empty())
			continue;

		size_t found = sFile.find(sName);
		const size_t sfound = sName.rfind('/');
		if (found == std::string::npos && sfound != std::string::npos)
		{
			// Full location not referenced: retry with the bare file name
			sName.erase(0, sfound + 1);
			if (sName.empty())
				continue;
			found = sFile.find(sName);
		}

		while (found != std::string::npos)
		{
			// Start of the value: the nearest quote, '>' or '=' at or before the match
			size_t fq = sFile.find_last_of("\"\'>=", found);
			if (fq == std::string::npos)
			{
				// Nothing delimits the match on the left: not an attribute value
				found = sFile.find(sName, found + sName.length());
				continue;
			}
			const char ch = sFile[fq];
			if (ch != '\"' && ch != '\'')
				fq++;

			// End of the value: the first quote, angle bracket or space after
			// the match, moved one further when the following character is a
			// quote; the end of the text when there is no such delimiter.
			size_t tq = sFile.find_first_of("\"\'<> ", found);
			if (tq == std::string::npos)
				tq = sFile.length();
			else if (sFile[tq + 1] == '\"' || sFile[tq + 1] == '\'')
				tq++;

			if (ch != '>')
			{
				const std::string is = '\"' + item.second + '\"';
				sFile.replace(fq, tq - fq, is);
				found = sFile.find(sName, fq + is.length());
			}
			else
			{
				// Match inside element text: leave it, but always move past it
				// so the same occurrence is not found again.
				found = sFile.find(sName, tq > found ? tq : found + 1);
			}
		}
	}
	return sFile;
}
static std::wstring mhtToXhtml(std::string& sFileContent)
{
sFileContent = mhtTohtml(sFileContent);
// Gumbo
GumboOptions options = kGumboDefaultOptions;
GumboOutput* output = gumbo_parse_with_options(&options, sFileContent.data(), sFileContent.length());
// prettyprint
NSStringUtils::CStringBuilderA oBuilder;
prettyprint(output->document, oBuilder);
// Конвертирование из string utf8 в wstring
return UTF8_TO_U(oBuilder.GetData());
}
// Escapes the characters &, < and > in text with their XML entities.
static void substitute_xml_entities_into_text(std::string& text)
{
	// '&' has to stay first: the other replacements insert '&' themselves
	static const char* const arEntities[][2] =
	{
		{"&", "&amp;"},
		{"<", "&lt;"},
		{">", "&gt;"}
	};
	const size_t nCount = sizeof(arEntities) / sizeof(arEntities[0]);
	for (size_t nIndex = 0; nIndex < nCount; ++nIndex)
		replace_all(text, arEntities[nIndex][0], arEntities[nIndex][1]);
}
// Escapes text for use inside a double-quoted attribute value: &, < and >
// first, then every double quote.
static void substitute_xml_entities_into_attributes(std::string& text)
{
	substitute_xml_entities_into_text(text);

	const std::string sQuoteEntity = "&quot;";
	size_t nQuote = text.find('\"');
	while (nQuote != std::string::npos)
	{
		text.replace(nQuote, 1, sQuoteEntity);
		// Continue behind the entity that was just inserted
		nQuote = text.find('\"', nQuote + sQuoteEntity.length());
	}
}
// Builds a tag name from the original source text of a tag.
//   text - the original tag text as recorded by Gumbo; not modified (a copy is trimmed).
// Returns the text reduced by gumbo_tag_from_original_text() with all
// punctuation from the list below removed, or an empty string when there is
// no original text.
static std::string handle_unknown_tag(GumboStringPiece* text)
{
	if (NULL == text->data)
		return std::string();

	GumboStringPiece oPiece = *text;
	gumbo_tag_from_original_text(&oPiece);

	// Characters that are dropped from the resulting name
	const std::string sForbidden = "-'+,./=?;!*#@$_%<>&;\"\'()[]{}";

	std::string sTag;
	sTag.reserve(oPiece.length);
	for (size_t nIndex = 0; nIndex < oPiece.length; ++nIndex)
	{
		const char ch = oPiece.data[nIndex];
		if (sForbidden.find(ch) == std::string::npos)
			sTag += ch;
	}
	return sTag;
}
// Returns the name used for a node: "document" for the document node,
// otherwise Gumbo's normalized tag name, falling back to a name derived from
// the original tag text when Gumbo has no name for the tag.
static std::string get_tag_name(GumboNode* node)
{
	if (GUMBO_NODE_DOCUMENT == node->type)
		return "document";

	std::string sName = gumbo_normalized_tagname(node->v.element.tag);
	if (sName.empty())
		sName = handle_unknown_tag(&node->v.element.original_tag);
	return sName;
}
// Writes the <!DOCTYPE ...> declaration of a document node, if it has one.
//   node     - the document node.
//   oBuilder - output buffer.
// The PUBLIC part is written only when a non-empty public identifier exists.
static void build_doctype(GumboNode* node, NSStringUtils::CStringBuilderA& oBuilder)
{
	if (!node->v.document.has_doctype)
		return;

	oBuilder.WriteString("<!DOCTYPE ");
	if (node->v.document.name != NULL)
		oBuilder.WriteString(node->v.document.name);

	// Test the pointer before reading through it: building a std::string from
	// a null pointer is undefined behaviour.
	const char* pPublicId = node->v.document.public_identifier;
	if (pPublicId != NULL && *pPublicId != '\0')
	{
		oBuilder.WriteString(" PUBLIC \"");
		oBuilder.WriteString(pPublicId);
		oBuilder.WriteString("\" \"");
		if (node->v.document.system_identifier != NULL)
			oBuilder.WriteString(node->v.document.system_identifier);
		oBuilder.WriteString("\"");
	}
	oBuilder.WriteString(">");
}
static void build_attributes(const GumboVector* attribs, bool no_entities, NSStringUtils::CStringBuilderA& atts)
{
std::vector<std::string> arrRepeat;
for (size_t i = 0; i < attribs->length; ++i)
{
GumboAttribute* at = static_cast<GumboAttribute*>(attribs->data[i]);
std::string sVal(at->value);
std::string sName(at->name);
atts.WriteString(" ");
bool bCheck = false;
size_t nBad = sName.find_first_of("+,.=?#%<>&;\"\'()[]{}");
while(nBad != std::string::npos)
{
sName.erase(nBad, 1);
nBad = sName.find_first_of("+,.=?#%<>&;\"\'()[]{}", nBad);
if(sName.empty())
break;
bCheck = true;
}
if(sName.empty())
continue;
while(sName.front() >= '0' && sName.front() <= '9')
{
sName.erase(0, 1);
if(sName.empty())
break;
bCheck = true;
}
if(bCheck)
{
GumboAttribute* check = gumbo_get_attribute(attribs, sName.c_str());
if(check || std::find(arrRepeat.begin(), arrRepeat.end(), sName) != arrRepeat.end())
continue;
else
arrRepeat.push_back(sName);
}
if(sName.empty())
continue;
atts.WriteString(sName);
// determine original quote character used if it exists
std::string qs ="\"";
atts.WriteString("=");
atts.WriteString(qs);
if(!no_entities)
substitute_xml_entities_into_attributes(sVal);
atts.WriteString(sVal);
atts.WriteString(qs);
}
}
// Serializes the children of a node.
//   node     - parent node whose children are written.
//   contents - output buffer.
// Text is entity-escaped (unless the parent tag is listed in no_entity_sub)
// and stripped of form feed characters; elements and templates are written
// recursively; whitespace-only nodes are written only for parents listed in
// preserve_whitespace, nonbreaking_inline or treat_like_inline; every other
// node type, comments included, is dropped.
static void prettyprint_contents(GumboNode* node, NSStringUtils::CStringBuilderA& contents)
{
	const std::string sKey = "|" + get_tag_name(node) + "|";
	const bool bRawText = no_entity_sub.find(sKey) != std::string::npos;
	const bool bKeepBlank = preserve_whitespace.find(sKey) != std::string::npos
		|| nonbreaking_inline.find(sKey) != std::string::npos
		|| treat_like_inline.find(sKey) != std::string::npos;

	const GumboVector* pChildren = &node->v.element.children;
	for (size_t nIndex = 0; nIndex < pChildren->length; ++nIndex)
	{
		GumboNode* pChild = static_cast<GumboNode*>(pChildren->data[nIndex]);
		switch (pChild->type)
		{
		case GUMBO_NODE_TEXT:
		{
			std::string sText(pChild->v.text.text);
			if (!bRawText)
				substitute_xml_entities_into_text(sText);
			// Remove form feed characters
			sText.erase(std::remove(sText.begin(), sText.end(), '\014'), sText.end());
			contents.WriteString(sText);
			break;
		}
		case GUMBO_NODE_ELEMENT:
		case GUMBO_NODE_TEMPLATE:
			prettyprint(pChild, contents);
			break;
		case GUMBO_NODE_WHITESPACE:
			if (bKeepBlank)
				contents.WriteString(pChild->v.text.text);
			break;
		default:
			// Comments and any other node types produce no output
			break;
		}
	}
}
// Serializes a node and its subtree.
//   node     - node to write.
//   oBuilder - output buffer.
// The document node produces its doctype followed by its children. Any other
// node produces an opening tag with attributes, its children and a closing
// tag; tags listed in empty_tags are opened as "<tag ... />" and get no
// closing tag.
static void prettyprint(GumboNode* node, NSStringUtils::CStringBuilderA& oBuilder)
{
	// The document node has no tag of its own
	if (GUMBO_NODE_DOCUMENT == node->type)
	{
		build_doctype(node, oBuilder);
		prettyprint_contents(node, oBuilder);
		return;
	}

	const std::string sTag = get_tag_name(node);
	const std::string sKey = "|" + sTag + "|";
	const bool bSelfClosing = empty_tags.find(sKey) != std::string::npos;
	const bool bRawValues = no_entity_sub.find(sKey) != std::string::npos;

	// Opening tag with its attributes
	oBuilder.WriteString("<" + sTag);
	build_attributes(&node->v.element.attributes, bRawValues, oBuilder);
	oBuilder.WriteString(bSelfClosing ? "/>" : ">");

	// Children, then the closing tag where one is needed
	prettyprint_contents(node, oBuilder);
	if (!bSelfClosing)
		oBuilder.WriteString("</" + sTag + ">");
}
#endif // HTMLTOXHTML_H