diff --git a/Common/DocxFormat/Source/XlsxFormat/SharedStrings/Text.h b/Common/DocxFormat/Source/XlsxFormat/SharedStrings/Text.h
index af318bea6b..f6e1761451 100644
--- a/Common/DocxFormat/Source/XlsxFormat/SharedStrings/Text.h
+++ b/Common/DocxFormat/Source/XlsxFormat/SharedStrings/Text.h
@@ -61,7 +61,7 @@ namespace OOX
 				if(std::wstring::npos != m_sText.find(' ') || std::wstring::npos != m_sText.find('\n'))
 					writer.WriteString(_T(" xml:space=\"preserve\""));
 				writer.WriteString(_T(">"));
-				writer.WriteEncodeXmlString(m_sText);
+				writer.WriteEncodeXmlStringHHHH(m_sText);
 				writer.WriteString(_T(""));
 			}
 			virtual void toXML2(NSStringUtils::CStringBuilder& writer, const wchar_t* name) const
@@ -71,7 +71,7 @@ namespace OOX
 				if(std::wstring::npos != m_sText.find(' ') || std::wstring::npos != m_sText.find('\n'))
 					writer.WriteString(_T(" xml:space=\"preserve\""));
 				writer.WriteString(_T(">"));
-				writer.WriteEncodeXmlString(m_sText);
+				writer.WriteEncodeXmlStringHHHH(m_sText);
 				writer.WriteString(_T(""));
@@ -83,7 +83,24 @@ namespace OOX
 				if ( oReader.IsEmptyNode() )
 					return;
 
-				m_sText = oReader.GetText3();
+				// The removed line assigned m_sText; the loop below only appends, so start from an empty string.
+				m_sText.clear();
+				int nDepth = oReader.GetDepth();
+				XmlUtils::XmlNodeType eNodeType = XmlUtils::XmlNodeType_EndElement;
+				while (oReader.Read(eNodeType) && oReader.GetDepth() >= nDepth && XmlUtils::XmlNodeType_EndElement != eNodeType)
+				{
+					if (eNodeType == XmlUtils::XmlNodeType_Text || eNodeType == XmlUtils::XmlNodeType_Whitespace || eNodeType == XmlUtils::XmlNodeType_SIGNIFICANT_WHITESPACE)
+					{
+						std::string sTemp = oReader.GetTextA();
+						// A NULL buffer makes the converter allocate it; RELEASEARRAYOBJECTS is applied to it below.
+						wchar_t* pUnicodes = NULL;
+						LONG lOutputCount = 0;
+						NSFile::CUtf8Converter::GetUnicodeStringFromUTF8WithHHHH((const BYTE*)sTemp.c_str(), (LONG)sTemp.length(), pUnicodes, lOutputCount);
+						m_sText.append(pUnicodes);
+						RELEASEARRAYOBJECTS(pUnicodes);
+					}
+				}
+
 				NSStringExt::Replace(m_sText, L"\t", L"");
 				if(!(m_oSpace.IsInit() && SimpleTypes::xmlspacePreserve == m_oSpace->GetValue()))
 				{
diff --git a/DesktopEditor/common/File.cpp b/DesktopEditor/common/File.cpp
index 
7aa6745498..657cd46c48 100644
--- a/DesktopEditor/common/File.cpp
+++ b/DesktopEditor/common/File.cpp
@@ -121,6 +121,10 @@ namespace NSFile
 	std::wstring CUtf8Converter::GetUnicodeFromCharPtr(const std::string& sParam, INT bIsUtf8)
 	{
 		return GetUnicodeFromCharPtr(sParam.c_str(), (LONG)sParam.length(), bIsUtf8);
+	}
+	LONG CUtf8Converter::GetUnicodeStringFromUTF8BufferSize(LONG lCount)
+	{
+		return lCount + 1; // lCount characters plus the 0 terminator the decoders append
 	}
 	std::wstring CUtf8Converter::GetUnicodeStringFromUTF8_4bytes( BYTE* pBuffer, LONG lCount )
 	{
@@ -305,6 +309,259 @@ namespace NSFile
 		return GetUnicodeStringFromUTF8_4bytes(pBuffer, lCount);
 	}
 
+// Shared body of both CheckHHHHChar overloads: when pBuffer starts with the
+// 7-character escape "_xHHHH_" (four hex digits) it returns the code HHHH,
+// otherwise -1. The 0 != pBuffer[n] tests stop the scan at a 0 terminator.
+#define CHECK_HHHH(pBuffer) \
+	wchar_t code = 0; \
+	if('_' == pBuffer[0] && 'x' == pBuffer[1] && 0 != pBuffer[2] && 0 != pBuffer[3] && 0 != pBuffer[4] && 0 != pBuffer[5] && '_' == pBuffer[6]) \
+	{ \
+		int i = 2; \
+		for(; i < 6; ++i) \
+		{ \
+			code *= 16; \
+			if('0' <= pBuffer[i] && pBuffer[i] <= '9') \
+			{ \
+				code += pBuffer[i] - '0'; \
+			} \
+			else if('A' <= pBuffer[i] && pBuffer[i] <= 'F') \
+			{ \
+				code += pBuffer[i] - 'A' + 10; \
+			} \
+			else if('a' <= pBuffer[i] && pBuffer[i] <= 'f') \
+			{ \
+				code += pBuffer[i] - 'a' + 10; \
+			} \
+			else \
+			{ \
+				break; \
+			} \
+		} \
+		if(i == 6) \
+		{ \
+			if(0x005F == code) \
+			{ \
+				code = '_'; \
+			} \
+			return code; \
+		} \
+	} \
+	return -1;
+
+	long CUtf8Converter::CheckHHHHChar(const BYTE* pBuffer)
+	{
+		CHECK_HHHH(pBuffer);
+	}
+	long CUtf8Converter::CheckHHHHChar(const wchar_t* pBuffer)
+	{
+		CHECK_HHHH(pBuffer);
+	}
+
+	// Decodes lCount bytes of UTF-8 into pUnicodes (one code point per wchar_t),
+	// replacing every "_xHHHH_" escape with the character it names.
+	// A NULL pUnicodes is allocated here with new[]; the output is 0-terminated
+	// and lOutputCount receives its length without the terminator.
+	void CUtf8Converter::GetUnicodeStringFromUTF8WithHHHH_4bytes( const BYTE* pBuffer, LONG lCount, wchar_t*& pUnicodes, LONG& lOutputCount )
+	{
+		if (NULL == pUnicodes)
+		{
+			pUnicodes = new wchar_t[GetUnicodeStringFromUTF8BufferSize(lCount)];
+		}
+		WCHAR* pUnicodeString = pUnicodes;
+		LONG lIndexUnicode = 0;
+
+		LONG lIndex = 0;
+		while (lIndex < lCount)
+		{
+			BYTE byteMain = pBuffer[lIndex];
+			if (0x00 == (byteMain & 0x80))
+			{
+				// 1 byte; only probe for "_xHHHH_" when all 7 bytes lie inside [0, lCount)
+				long code = (lIndex + 7 <= lCount) ? CheckHHHHChar(pBuffer + lIndex) : -1;
+				if(code < 0)
+				{
+					pUnicodeString[lIndexUnicode++] = (WCHAR)byteMain;
+					++lIndex;
+				}
+				else
+				{
+					pUnicodeString[lIndexUnicode++] = (WCHAR)code;
+					lIndex += 7;
+				}
+			}
+			else if (0x00 == (byteMain & 0x20))
+			{
+				// 2 byte
+				int val = (int)(((byteMain & 0x1F) << 6) |
+					(pBuffer[lIndex + 1] & 0x3F));
+				pUnicodeString[lIndexUnicode++] = (WCHAR)(val);
+				lIndex += 2;
+			}
+			else if (0x00 == (byteMain & 0x10))
+			{
+				// 3 byte
+				int val = (int)(((byteMain & 0x0F) << 12) |
+					((pBuffer[lIndex + 1] & 0x3F) << 6) |
+					(pBuffer[lIndex + 2] & 0x3F));
+				pUnicodeString[lIndexUnicode++] = (WCHAR)(val);
+				lIndex += 3;
+			}
+			else if (0x00 == (byteMain & 0x0F))
+			{
+				// 4 byte
+				int val = (int)(((byteMain & 0x07) << 18) |
+					((pBuffer[lIndex + 1] & 0x3F) << 12) |
+					((pBuffer[lIndex + 2] & 0x3F) << 6) |
+					(pBuffer[lIndex + 3] & 0x3F));
+				pUnicodeString[lIndexUnicode++] = (WCHAR)(val);
+				lIndex += 4;
+			}
+			else if (0x00 == (byteMain & 0x08))
+			{
+				// 4 byte
+				int val = (int)(((byteMain & 0x07) << 18) |
+					((pBuffer[lIndex + 1] & 0x3F) << 12) |
+					((pBuffer[lIndex + 2] & 0x3F) << 6) |
+					(pBuffer[lIndex + 3] & 0x3F));
+				pUnicodeString[lIndexUnicode++] = (WCHAR)(val);
+				lIndex += 4;
+			}
+			else if (0x00 == (byteMain & 0x04))
+			{
+				// 5 byte
+				int val = (int)(((byteMain & 0x03) << 24) |
+					((pBuffer[lIndex + 1] & 0x3F) << 18) |
+					((pBuffer[lIndex + 2] & 0x3F) << 12) |
+					((pBuffer[lIndex + 3] & 0x3F) << 6) |
+					(pBuffer[lIndex + 4] & 0x3F));
+				pUnicodeString[lIndexUnicode++] = (WCHAR)(val);
+				lIndex += 5;
+			}
+			else
+			{
+				// 6 byte
+				int val = (int)(((byteMain & 0x01) << 30) |
+					((pBuffer[lIndex + 1] & 0x3F) << 24) |
+					((pBuffer[lIndex + 2] & 0x3F) << 18) |
+					((pBuffer[lIndex + 3] & 0x3F) << 12) |
+					((pBuffer[lIndex + 4] & 0x3F) << 6) |
+					(pBuffer[lIndex + 5] & 0x3F));
+				pUnicodeString[lIndexUnicode++] = (WCHAR)(val);
+				lIndex += 6; // six bytes were read above, so skip all six
+			}
+		}
+
+		pUnicodeString[lIndexUnicode] = 0;
+		lOutputCount = lIndexUnicode;
+	}
+	// Same as the _4bytes variant for a 16-bit wchar_t: sequences of three or more
+	// bytes are stored through WriteUtf16_WCHAR. lOutputCount excludes the terminator.
+	void CUtf8Converter::GetUnicodeStringFromUTF8WithHHHH_2bytes( const BYTE* pBuffer, LONG lCount, wchar_t*& pUnicodes, LONG& lOutputCount )
+	{
+		if (NULL == pUnicodes)
+		{
+			pUnicodes = new wchar_t[GetUnicodeStringFromUTF8BufferSize(lCount)];
+		}
+		WCHAR* pUnicodeString = pUnicodes;
+		WCHAR* pStart = pUnicodeString;
+		LONG lIndex = 0;
+		while (lIndex < lCount)
+		{
+			BYTE byteMain = pBuffer[lIndex];
+			if (0x00 == (byteMain & 0x80))
+			{
+				// 1 byte; only probe for "_xHHHH_" when all 7 bytes lie inside [0, lCount)
+				long code = (lIndex + 7 <= lCount) ? CheckHHHHChar(pBuffer + lIndex) : -1;
+				if(code < 0)
+				{
+					*pUnicodeString++ = (WCHAR)byteMain;
+					++lIndex;
+				}
+				else
+				{
+					*pUnicodeString++ = (WCHAR)code;
+					lIndex += 7;
+				}
+
+			}
+			else if (0x00 == (byteMain & 0x20))
+			{
+				// 2 byte
+				int val = (int)(((byteMain & 0x1F) << 6) |
+					(pBuffer[lIndex + 1] & 0x3F));
+				*pUnicodeString++ = (WCHAR)(val);
+				lIndex += 2;
+			}
+			else if (0x00 == (byteMain & 0x10))
+			{
+				// 3 byte
+				int val = (int)(((byteMain & 0x0F) << 12) |
+					((pBuffer[lIndex + 1] & 0x3F) << 6) |
+					(pBuffer[lIndex + 2] & 0x3F));
+
+				WriteUtf16_WCHAR(val, pUnicodeString);
+				lIndex += 3;
+			}
+			else if (0x00 == (byteMain & 0x0F))
+			{
+				// 4 byte
+				int val = (int)(((byteMain & 0x07) << 18) |
+					((pBuffer[lIndex + 1] & 0x3F) << 12) |
+					((pBuffer[lIndex + 2] & 0x3F) << 6) |
+					(pBuffer[lIndex + 3] & 0x3F));
+
+				WriteUtf16_WCHAR(val, pUnicodeString);
+				lIndex += 4;
+			}
+			else if (0x00 == (byteMain & 0x08))
+			{
+				// 4 byte
+				int val = (int)(((byteMain & 0x07) << 18) |
+					((pBuffer[lIndex + 1] & 0x3F) << 12) |
+					((pBuffer[lIndex + 2] & 0x3F) << 6) |
+					(pBuffer[lIndex + 3] & 0x3F));
+
+				WriteUtf16_WCHAR(val, pUnicodeString);
+				lIndex += 4;
+			}
+			else if (0x00 == (byteMain & 0x04))
+			{
+				// 5 byte
+				int val = (int)(((byteMain & 0x03) << 24) |
+					((pBuffer[lIndex + 1] & 0x3F) << 18) |
+					((pBuffer[lIndex + 2] & 0x3F) << 12) |
+					((pBuffer[lIndex + 3] & 0x3F) << 6) |
+					(pBuffer[lIndex + 4] & 0x3F));
+
+				WriteUtf16_WCHAR(val, pUnicodeString);
+				lIndex += 5;
+			}
+			else
+			{
+				// 6 byte
+				int val = (int)(((byteMain & 0x01) << 30) |
+					((pBuffer[lIndex + 1] & 0x3F) << 24) |
+					((pBuffer[lIndex + 2] & 0x3F) << 18) |
+					((pBuffer[lIndex + 3] & 0x3F) << 12) |
+					((pBuffer[lIndex + 4] & 0x3F) << 6) |
+					(pBuffer[lIndex + 5] & 0x3F));
+
+				WriteUtf16_WCHAR(val, pUnicodeString);
+				lIndex += 6; // six bytes were read above, so skip all six
+			}
+		}
+
+		lOutputCount = (LONG)(pUnicodeString - pStart); // length without the terminator, like the _4bytes variant
+		*pUnicodeString = 0;
+	}
+	// Picks the decoder that matches sizeof(WCHAR) on this platform.
+	void CUtf8Converter::GetUnicodeStringFromUTF8WithHHHH( const BYTE* pBuffer, LONG lCount, wchar_t*& pUnicodes, LONG& lOutputCount )
+	{
+		if (sizeof(WCHAR) == 2)
+			return GetUnicodeStringFromUTF8WithHHHH_2bytes(pBuffer, lCount, pUnicodes, lOutputCount);
+		return GetUnicodeStringFromUTF8WithHHHH_4bytes(pBuffer, lCount, pUnicodes, lOutputCount);
+	}
+
 	void CUtf8Converter::GetUtf8StringFromUnicode_4bytes(const wchar_t* pUnicodes, LONG lCount, BYTE*& pData, LONG& lOutputCount, bool bIsBOM)
 	{
 		if (NULL == pData)
diff --git a/DesktopEditor/common/File.h b/DesktopEditor/common/File.h
index f8b8aa7f5b..c5993b7189 100644
--- a/DesktopEditor/common/File.h
+++ b/DesktopEditor/common/File.h
@@ -100,11 +100,17 @@ namespace NSFile
 
 		static std::wstring GetUnicodeFromCharPtr(const char* pData, LONG lCount, INT bIsUtf8 = FALSE);
 		static std::wstring GetUnicodeFromCharPtr(const std::string& sParam, INT bIsUtf8 = FALSE);
+
 		static std::wstring GetUnicodeStringFromUTF8_4bytes( BYTE* pBuffer, LONG lCount );
 		static std::wstring GetUnicodeStringFromUTF8_2bytes( BYTE* pBuffer, LONG lCount );
-
 		static std::wstring GetUnicodeStringFromUTF8( BYTE* pBuffer, LONG lCount );
 
+		static void GetUnicodeStringFromUTF8WithHHHH_4bytes( const BYTE* pBuffer, LONG lCount, wchar_t*& pUnicodes, LONG& lOutputCount );
+		static void GetUnicodeStringFromUTF8WithHHHH_2bytes( const BYTE* pBuffer, LONG lCount, wchar_t*& pUnicodes, LONG& lOutputCount );
+		static void GetUnicodeStringFromUTF8WithHHHH( const BYTE* pBuffer, LONG lCount, wchar_t*& pUnicodes, LONG& lOutputCount );
+
+		static LONG GetUnicodeStringFromUTF8BufferSize( LONG lCount ); // defined out of line in File.cpp, so not declared inline
+
 		static void GetUtf8StringFromUnicode_4bytes(const wchar_t* pUnicodes, LONG lCount, BYTE*& pData, LONG& lOutputCount, bool 
bIsBOM = false);
 		static void GetUtf8StringFromUnicode_2bytes(const wchar_t* pUnicodes, LONG lCount, BYTE*& pData, LONG& lOutputCount, bool bIsBOM = false);
 		static void GetUtf8StringFromUnicode(const wchar_t* pUnicodes, LONG lCount, BYTE*& pData, LONG& lOutputCount, bool bIsBOM = false);
@@ -118,6 +124,10 @@ namespace NSFile
 
 		static std::wstring GetWStringFromUTF16(const CStringUtf16& data);
 		static std::wstring GetWStringFromUTF16(const unsigned short* pUtf16, LONG lCount);
+
+		// Not inline: they are defined in File.cpp and the wchar_t overload is called from StringBuilder.cpp.
+		static long CheckHHHHChar(const BYTE* pBuffer);
+		static long CheckHHHHChar(const wchar_t* pBuffer);
 	};
 
 	class KERNEL_DECL CFileBinary
diff --git a/DesktopEditor/common/StringBuilder.cpp b/DesktopEditor/common/StringBuilder.cpp
index 4a6aeef1a2..e098f933eb 100644
--- a/DesktopEditor/common/StringBuilder.cpp
+++ b/DesktopEditor/common/StringBuilder.cpp
@@ -30,6 +30,7 @@
  *
  */
 #include "StringBuilder.h"
+#include "File.h"
 
 namespace NSStringUtils
 {
@@ -301,6 +302,20 @@ namespace NSStringUtils
 		else
 			WriteEncodeXmlString_4bytes(pString, nCount);
 	}
+	// Appends sString as XML text, classifying each character with CheckXmlCodeHHHH ("_xHHHH_" scheme).
+	void CStringBuilder::WriteEncodeXmlStringHHHH(const std::wstring& sString)
+	{
+		WriteEncodeXmlStringHHHH(sString.c_str(), (int)sString.length());
+	}
+
+	// Same for a raw buffer; forwards to the writer that matches sizeof(wchar_t).
+	void CStringBuilder::WriteEncodeXmlStringHHHH(const wchar_t* pString, int nCount)
+	{
+		if (sizeof(wchar_t) == 2)
+			WriteEncodeXmlStringHHHH_2bytes(pString, nCount);
+		else
+			WriteEncodeXmlStringHHHH_4bytes(pString, nCount);
+	}
 	inline void CStringBuilder::WriteEncodeXmlString_4bytes(const wchar_t* pString, int nCount)
 	{
 		const wchar_t* pData = pString;
@@ -354,6 +369,63 @@ namespace NSStringUtils
 			}
 		}
 	}
+	// 32-bit wchar_t writer: stops at the 0 terminator, or after nCount elements when nCount != -1.
+	inline void CStringBuilder::WriteEncodeXmlStringHHHH_4bytes(const wchar_t* pString, int nCount)
+	{
+		const wchar_t* pData = pString;
+		int nCounter = 0;
+		unsigned int code;
+		while (*pData != 0)
+		{
+			code = (unsigned int)*pData;
+			WriteEncodeXmlChar(*pData, CheckXmlCodeHHHH(code, pData));
+
+			++pData;
+			if (-1 != nCount)
+			{
+				++nCounter;
+				if (nCounter >= nCount)
+					break;
+			}
+		}
+	}
+	// 16-bit wchar_t writer: a surrogate unit followed by another unit inside the nCount
+	// window is classified as one code point and both units are written.
+	inline void CStringBuilder::WriteEncodeXmlStringHHHH_2bytes(const wchar_t* pString, int nCount)
+	{
+		const wchar_t* pData = pString;
+		int nCounter = 0;
+		unsigned int code;
+		BYTE type;
+		while (*pData != 0)
+		{
+			code = (unsigned int)*pData;
+			if (code >= 0xD800 && code <= 0xDFFF && (-1 == nCount || nCounter + 1 < nCount) && *(pData + 1) != 0)
+			{
+				code = 0x10000 + (((code & 0x3FF) << 10) | (0x03FF & *(pData + 1)));
+				type = CheckXmlCodeHHHH(code, pData);
+				if(0 != type)
+				{
+					WriteEncodeXmlChar(*pData, type);
+					++pData;
+					++nCounter; // the pair used two units of nCount, not one
+				}
+			}
+			else
+			{
+				type = CheckXmlCodeHHHH(code, pData);
+			}
+			WriteEncodeXmlChar(*pData, type);
+
+			++pData;
+			if (-1 != nCount)
+			{
+				++nCounter;
+				if (nCounter >= nCount)
+					break;
+			}
+		}
+	}
 	inline void CStringBuilder::WriteEncodeXmlChar(wchar_t code, BYTE type)
 	{
 		switch (type)
@@ -436,6 +508,27 @@ namespace NSStringUtils
 			*m_pDataCur++ = (wchar_t)(';');
 			m_lSizeCur += 5;
 			break;
+		case 10: // '_' that opens a literal "_xHHHH_": written as "_x005F_"
+			AddSize(7);
+			*m_pDataCur++ = (wchar_t)('_');
+			*m_pDataCur++ = (wchar_t)('x');
+			*m_pDataCur++ = (wchar_t)('0');
+			*m_pDataCur++ = (wchar_t)('0');
+			*m_pDataCur++ = (wchar_t)('5');
+			*m_pDataCur++ = (wchar_t)('F');
+			*m_pDataCur++ = (wchar_t)('_');
+			m_lSizeCur += 7;
+			break;
+		case 11: // code outside the XML 1.0 ranges: "_x", high byte, low byte, "_"
+			AddSize(7);
+			*m_pDataCur++ = (wchar_t)('_');
+			*m_pDataCur++ = (wchar_t)('x');
+			m_lSizeCur += 2;
+			WriteHexByteNoSafe((code >> 8) & 0xFF);
+			WriteHexByteNoSafe(code & 0xFF);
+			*m_pDataCur++ = (wchar_t)('_');
+			++m_lSizeCur;
+			break;
 		default:
 			break;
 		}
@@ -751,6 +844,37 @@ namespace NSStringUtils
 
 		return 0;
 	}
+	// Classifies code point c (read at pData) for WriteEncodeXmlChar: 2..9 are the characters tested below,
+	// 10 a '_' that starts "_xHHHH_", 1 a valid XML 1.0 character, 11 an invalid one up to 0xFFFF, 0 anything else.
+	unsigned char CStringBuilder::CheckXmlCodeHHHH(unsigned int c, const wchar_t* pData)
+	{
+		if ('&' == c)
+			return 2;
+		if ('\'' == c)
+			return 3;
+		if ('<' == c)
+			return 4;
+		if ('>' == c)
+			return 5;
+		if ('\"' == c)
+			return 6;
+		if ('\n' == c)//when reading from the attributes is replaced by a space.
+			return 7;
+		if ('\r' == c)//when reading from the attributes is replaced by a space.
+			return 8;
+		if ('\t' == c)//when reading from the attributes is replaced by a space.
+			return 9;
+		if (NSFile::CUtf8Converter::CheckHHHHChar(pData) >= 0)
+			return 10;
+
+		//xml 1.0 Character Range https://www.w3.org/TR/xml/#charsets
+		if ((0x20 <= c && c <= 0xD7FF) || (0xE000 <= c && c <= 0xFFFD) || (0x10000 <= c && c <= 0x10FFFF))
+			return 1;
+		else if(c <= 0xFFFF)
+			return 11;
+
+		return 0;
+	}
 
 	void string_replace(std::wstring& text, const std::wstring& replaceFrom, const std::wstring& replaceTo)
 	{
diff --git a/DesktopEditor/common/StringBuilder.h b/DesktopEditor/common/StringBuilder.h
index 43bddc38fa..c45275ce4c 100644
--- a/DesktopEditor/common/StringBuilder.h
+++ b/DesktopEditor/common/StringBuilder.h
@@ -109,9 +109,11 @@ namespace NSStringUtils
 		void AddChar2Safe(const wchar_t _c1, const wchar_t& _c2);
 
 		void WriteEncodeXmlString(const std::wstring& sString);
-
 		void WriteEncodeXmlString(const wchar_t* pString, int nCount = -1);
 
+		void WriteEncodeXmlStringHHHH(const std::wstring& sString);
+		void WriteEncodeXmlStringHHHH(const wchar_t* pString, int nCount = -1);
+
 		size_t GetCurSize();
 		void SetCurSize(size_t lCurSize);
 		size_t GetSize();
@@ -147,8 +149,11 @@ namespace NSStringUtils
 	protected:
 		inline void WriteEncodeXmlString_4bytes(const wchar_t* pString, int nCount);
 		inline void WriteEncodeXmlString_2bytes(const wchar_t* pString, int nCount);
+		inline void WriteEncodeXmlStringHHHH_4bytes(const wchar_t* pString, int nCount);
+		inline void WriteEncodeXmlStringHHHH_2bytes(const wchar_t* pString, int nCount);
 		inline void WriteEncodeXmlChar(wchar_t code, unsigned char type);
 		inline unsigned char CheckXmlCode(unsigned int c);
+		inline unsigned char CheckXmlCodeHHHH(unsigned int c, const wchar_t* pData);
 	};
 
 	KERNEL_DECL void string_replace(std::wstring& text, const std::wstring& replaceFrom, const std::wstring& replaceTo);