From 7b0f902c0ff702699e16bfac1e568f1dd9c4b971 Mon Sep 17 00:00:00 2001 From: Sergey Konovalov Date: Wed, 23 May 2018 20:32:03 +0300 Subject: [PATCH] [x2t] Turn on DeleteNoUnicode option for bug #37789 --- Common/DocxFormat/Source/XML/Utils.h | 32 +++- DesktopEditor/common/StringBuilder.cpp | 204 +++++++++++++++++-------- DesktopEditor/common/StringBuilder.h | 5 +- 3 files changed, 174 insertions(+), 67 deletions(-) diff --git a/Common/DocxFormat/Source/XML/Utils.h b/Common/DocxFormat/Source/XML/Utils.h index 54d0b52b22..a5ac9159cc 100644 --- a/Common/DocxFormat/Source/XML/Utils.h +++ b/Common/DocxFormat/Source/XML/Utils.h @@ -297,7 +297,7 @@ namespace XmlUtils return result; } - AVSINLINE static std::string EncodeXmlString(const std::string& data, bool bDeleteNoUnicode = false) + AVSINLINE static std::string EncodeXmlString(const std::string& data, bool bDeleteNoUnicode = true) { std::string buffer; buffer.reserve(data.size()); @@ -326,6 +326,14 @@ namespace XmlUtils { buffer.append(&data[pos-1], 2); } + else + { + buffer.append(" "); + } + } + else + { + buffer.append(" "); } } else @@ -354,7 +362,7 @@ namespace XmlUtils return buffer; } - AVSINLINE static std::wstring EncodeXmlString(const std::wstring& data, bool bDeleteNoUnicode = false) + AVSINLINE static std::wstring EncodeXmlString(const std::wstring& data, bool bDeleteNoUnicode = true) { std::wstring buffer; buffer.reserve(data.size()); @@ -383,6 +391,14 @@ namespace XmlUtils { buffer.append(&data[pos-1], 2); } + else + { + buffer.append(L" "); + } + } + else + { + buffer.append(L" "); } } else @@ -411,7 +427,7 @@ namespace XmlUtils return buffer; } - AVSINLINE static std::wstring EncodeXmlStringExtend(const std::wstring& data, bool bDeleteNoUnicode = false) + AVSINLINE static std::wstring EncodeXmlStringExtend(const std::wstring& data, bool bDeleteNoUnicode = true) { std::wstring buffer; buffer.reserve(data.size()); @@ -430,7 +446,6 @@ namespace XmlUtils case '\n': buffer.append(L" "); break; case '\r': 
buffer.append(L" "); break; case '\t': buffer.append(L" "); break; - case 160: buffer.append(L" "); break; default: { if ( false == IsUnicodeSymbol( data[pos] ) ) @@ -444,6 +459,14 @@ namespace XmlUtils { buffer.append(&data[pos-1], 2); } + else + { + buffer.append(L" "); + } + } + else + { + buffer.append(L" "); } } else @@ -466,7 +489,6 @@ namespace XmlUtils case '\n': buffer.append(L" "); break; case '\r': buffer.append(L" "); break; case '\t': buffer.append(L" "); break; - case 160: buffer.append(L" "); break; case '\0': return buffer; default: buffer.append(&data[pos], 1); break; diff --git a/DesktopEditor/common/StringBuilder.cpp b/DesktopEditor/common/StringBuilder.cpp index 892df83f33..9d5b9aec8a 100644 --- a/DesktopEditor/common/StringBuilder.cpp +++ b/DesktopEditor/common/StringBuilder.cpp @@ -295,79 +295,151 @@ namespace NSStringUtils } void CStringBuilder::WriteEncodeXmlString(const wchar_t* pString, int nCount) + { + if (sizeof(wchar_t) == 2) + WriteEncodeXmlString_2bytes(pString, nCount); + else + WriteEncodeXmlString_4bytes(pString, nCount); + } + inline void CStringBuilder::WriteEncodeXmlString_4bytes(const wchar_t* pString, int nCount) { const wchar_t* pData = pString; int nCounter = 0; + unsigned int code; while (*pData != 0) { - BYTE _code = CheckCode(*pData); - - switch (_code) - { - case 1: - AddCharSafe(*pData); - break; - case 0: - AddCharSafe((wchar_t)' '); - break; - case 2: - AddSize(5); - *m_pDataCur++ = (wchar_t)('&'); - *m_pDataCur++ = (wchar_t)('a'); - *m_pDataCur++ = (wchar_t)('m'); - *m_pDataCur++ = (wchar_t)('p'); - *m_pDataCur++ = (wchar_t)(';'); - m_lSizeCur += 5; - break; - case 3: - AddSize(6); - *m_pDataCur++ = (wchar_t)('&'); - *m_pDataCur++ = (wchar_t)('a'); - *m_pDataCur++ = (wchar_t)('p'); - *m_pDataCur++ = (wchar_t)('o'); - *m_pDataCur++ = (wchar_t)('s'); - *m_pDataCur++ = (wchar_t)(';'); - m_lSizeCur += 6; - break; - case 4: - AddSize(4); - *m_pDataCur++ = (wchar_t)('&'); - *m_pDataCur++ = (wchar_t)('l'); - *m_pDataCur++ 
= (wchar_t)('t'); - *m_pDataCur++ = (wchar_t)(';'); - m_lSizeCur += 4; - break; - case 5: - AddSize(4); - *m_pDataCur++ = (wchar_t)('&'); - *m_pDataCur++ = (wchar_t)('g'); - *m_pDataCur++ = (wchar_t)('t'); - *m_pDataCur++ = (wchar_t)(';'); - m_lSizeCur += 4; - break; - case 6: - AddSize(6); - *m_pDataCur++ = (wchar_t)('&'); - *m_pDataCur++ = (wchar_t)('q'); - *m_pDataCur++ = (wchar_t)('u'); - *m_pDataCur++ = (wchar_t)('o'); - *m_pDataCur++ = (wchar_t)('t'); - *m_pDataCur++ = (wchar_t)(';'); - m_lSizeCur += 6; - break; - default: - break; - } + code = (unsigned int)*pData; + WriteEncodeXmlChar(*pData, CheckXmlCode(code)); ++pData; if (-1 != nCount) { ++nCounter; - if (nCounter == nCount) + if (nCounter >= nCount) break; } } } + inline void CStringBuilder::WriteEncodeXmlString_2bytes(const wchar_t* pString, int nCount) + { + const wchar_t* pData = pString; + int nCounter = 0; + unsigned int code; + BYTE type; + while (*pData != 0) + { + code = (unsigned int)*pData; + if (code >= 0xD800 && code <= 0xDFFF && *(pData + 1) != 0) + { + code = 0x10000 + (((code & 0x3FF) << 10) | (0x03FF & *(pData + 1))); + type = CheckXmlCode(code); + if(0 != type) + { + WriteEncodeXmlChar(*pData, type); + ++pData; + } + } + else + { + type = CheckXmlCode(code); + } + WriteEncodeXmlChar(*pData, type); + + ++pData; + if (-1 != nCount) + { + ++nCounter; + if (nCounter >= nCount) + break; + } + } + } + inline void CStringBuilder::WriteEncodeXmlChar(wchar_t code, BYTE type) + { + switch (type) + { + case 1: + AddCharSafe(code); + break; + case 0: + AddCharSafe((wchar_t)' '); + break; + case 2: + AddSize(5); + *m_pDataCur++ = (wchar_t)('&'); + *m_pDataCur++ = (wchar_t)('a'); + *m_pDataCur++ = (wchar_t)('m'); + *m_pDataCur++ = (wchar_t)('p'); + *m_pDataCur++ = (wchar_t)(';'); + m_lSizeCur += 5; + break; + case 3: + AddSize(6); + *m_pDataCur++ = (wchar_t)('&'); + *m_pDataCur++ = (wchar_t)('a'); + *m_pDataCur++ = (wchar_t)('p'); + *m_pDataCur++ = (wchar_t)('o'); + *m_pDataCur++ = (wchar_t)('s'); + 
*m_pDataCur++ = (wchar_t)(';'); + m_lSizeCur += 6; + break; + case 4: + AddSize(4); + *m_pDataCur++ = (wchar_t)('&'); + *m_pDataCur++ = (wchar_t)('l'); + *m_pDataCur++ = (wchar_t)('t'); + *m_pDataCur++ = (wchar_t)(';'); + m_lSizeCur += 4; + break; + case 5: + AddSize(4); + *m_pDataCur++ = (wchar_t)('&'); + *m_pDataCur++ = (wchar_t)('g'); + *m_pDataCur++ = (wchar_t)('t'); + *m_pDataCur++ = (wchar_t)(';'); + m_lSizeCur += 4; + break; + case 6: + AddSize(6); + *m_pDataCur++ = (wchar_t)('&'); + *m_pDataCur++ = (wchar_t)('q'); + *m_pDataCur++ = (wchar_t)('u'); + *m_pDataCur++ = (wchar_t)('o'); + *m_pDataCur++ = (wchar_t)('t'); + *m_pDataCur++ = (wchar_t)(';'); + m_lSizeCur += 6; + break; + case 7: + AddSize(5); + *m_pDataCur++ = (wchar_t)('&'); + *m_pDataCur++ = (wchar_t)('#'); + *m_pDataCur++ = (wchar_t)('x'); + *m_pDataCur++ = (wchar_t)('A'); + *m_pDataCur++ = (wchar_t)(';'); + m_lSizeCur += 5; + break; + case 8: + AddSize(5); + *m_pDataCur++ = (wchar_t)('&'); + *m_pDataCur++ = (wchar_t)('#'); + *m_pDataCur++ = (wchar_t)('x'); + *m_pDataCur++ = (wchar_t)('D'); + *m_pDataCur++ = (wchar_t)(';'); + m_lSizeCur += 5; + break; + case 9: + AddSize(5); + *m_pDataCur++ = (wchar_t)('&'); + *m_pDataCur++ = (wchar_t)('#'); + *m_pDataCur++ = (wchar_t)('x'); + *m_pDataCur++ = (wchar_t)('9'); + *m_pDataCur++ = (wchar_t)(';'); + m_lSizeCur += 5; + break; + default: + break; + } + } size_t CStringBuilder::GetCurSize() { @@ -654,7 +726,7 @@ namespace NSStringUtils WriteHexByteNoSafe((value >> 16) & 0xFF); } - unsigned char CStringBuilder::CheckCode(const wchar_t& c) + unsigned char CStringBuilder::CheckXmlCode(unsigned int c) { if ('&' == c) return 2; @@ -666,8 +738,18 @@ namespace NSStringUtils return 5; if ('\"' == c) return 6; + if ('\n' == c)//written as a character reference, because a literal line feed in an attribute value is read back as a space. + return 7; + if ('\r' == c)//written as a character reference, because a literal carriage return in an attribute value is read back as a space. + return 8; + if ('\t' == c)//written as a character reference, because a literal tab in an attribute value is read back as a space.
+ return 9; - return 1; + //xml 1.0 Character Range https://www.w3.org/TR/xml/#charsets + if ((0x20 <= c && c <= 0xD7FF) || (0xE000 <= c && c <= 0xFFFD) || (0x10000 <= c && c <= 0x10FFFF)) + return 1; + + return 0; } void string_replace(std::wstring& text, const std::wstring& replaceFrom, const std::wstring& replaceTo) diff --git a/DesktopEditor/common/StringBuilder.h b/DesktopEditor/common/StringBuilder.h index 3e48ff749f..3cdc378b2d 100644 --- a/DesktopEditor/common/StringBuilder.h +++ b/DesktopEditor/common/StringBuilder.h @@ -145,7 +145,10 @@ namespace NSStringUtils void WriteHexColor3(const unsigned int& value); protected: - unsigned char CheckCode(const wchar_t& c); + inline void WriteEncodeXmlString_4bytes(const wchar_t* pString, int nCount); + inline void WriteEncodeXmlString_2bytes(const wchar_t* pString, int nCount); + inline void WriteEncodeXmlChar(wchar_t code, unsigned char type); + inline unsigned char CheckXmlCode(unsigned int c); }; KERNEL_DECL void string_replace(std::wstring& text, const std::wstring& replaceFrom, const std::wstring& replaceTo);