Add method for working with XML files that have different BOMs

This commit is contained in:
Oleg Korshul
2022-03-31 18:04:22 +03:00
parent 7f0691ee09
commit 73845ec7fd
2 changed files with 92 additions and 0 deletions

View File

@ -335,6 +335,9 @@ namespace XmlUtils
std::string KERNEL_DECL Execute(const std::string& sXml, int mode = XML_C14N_1_0, bool withComments = false);
std::string KERNEL_DECL Execute(const std::wstring& sXmlFile, int mode = XML_C14N_1_0, bool withComments = false);
}
// Converts raw file content to a UTF-8 string based on its leading byte order mark (BOM).
// BOMs this function is meant to handle: UTF-8, UTF-16BE, UTF-16LE, UTF-32BE, UTF-32LE.
// pData - bytes of the file content (the implementation only reads them);
// len   - number of bytes available in pData.
// Content that does not start with a recognized BOM is returned unchanged as a byte string.
std::string KERNEL_DECL GetUtf8FromFileContent(unsigned char* pData, unsigned int len);
}
#endif // _BUILD_XMLUTILS_CROSSPLATFORM_H_

View File

@ -1067,3 +1067,92 @@ namespace XmlUtils
return Execute(sXml, mode, withComments);
}
}
// Converts raw file content to a UTF-8 string, using the leading byte order mark (BOM)
// to detect the source encoding.
//
// Recognized BOMs:
//   EF BB BF     - UTF-8    (the BOM is stripped, the remaining bytes are returned as is)
//   FF FE 00 00  - UTF-32LE
//   00 00 FE FF  - UTF-32BE
//   FF FE        - UTF-16LE (only when it is not the beginning of the UTF-32LE BOM)
//   FE FF        - UTF-16BE
//
// pData - bytes of the file content (not modified);
// len   - number of bytes available in pData.
//
// Returns the content re-encoded as UTF-8 without the BOM. Content without a recognized BOM
// is returned unchanged. An incomplete trailing code unit and unpaired UTF-16 surrogates
// are skipped.
std::string XmlUtils::GetUtf8FromFileContent(unsigned char* pData, unsigned int len)
{
    if (!pData || 0 == len)
        return std::string();

    // The UTF-8 BOM is exactly three bytes long: EF BB BF.
    if (len >= 3 && pData[0] == 0xEF && pData[1] == 0xBB && pData[2] == 0xBF)
        return std::string((char*)pData + 3, (size_t)(len - 3));

    // UTF-32 is checked before UTF-16 because the UTF-32LE BOM (FF FE 00 00)
    // starts with the UTF-16LE BOM (FF FE).
    char markerUtf32 = 0; // 1 - little endian, 2 - big endian
    if (len >= 4)
    {
        if (pData[0] == 0xFF && pData[1] == 0xFE && pData[2] == 0x00 && pData[3] == 0x00)
            markerUtf32 = 1;
        else if (pData[0] == 0x00 && pData[1] == 0x00 && pData[2] == 0xFE && pData[3] == 0xFF)
            markerUtf32 = 2;
    }

    if (0 != markerUtf32)
    {
        int nCountSymbols = (int)((len - 4) >> 2);
        if (0 == nCountSymbols)
            return std::string();

        unsigned int* pUnicodes = new unsigned int[nCountSymbols];
        const unsigned char* pCurrent = pData + 4;
        for (int i = 0; i < nCountSymbols; ++i, pCurrent += 4)
        {
            // Widen to unsigned before shifting: a byte promoted to int and shifted
            // left by 24 overflows a signed int when the byte is >= 0x80.
            unsigned int b0 = pCurrent[0];
            unsigned int b1 = pCurrent[1];
            unsigned int b2 = pCurrent[2];
            unsigned int b3 = pCurrent[3];

            pUnicodes[i] = (markerUtf32 == 1) ? (b0 | (b1 << 8) | (b2 << 16) | (b3 << 24))
                                              : (b3 | (b2 << 8) | (b1 << 16) | (b0 << 24));
        }

        std::string sRet = NSStringExt::CConverter::GetUtf8FromUTF32(pUnicodes, nCountSymbols);
        RELEASEARRAYOBJECTS(pUnicodes);
        return sRet;
    }

    char markerUtf16 = 0; // 1 - little endian, 2 - big endian
    if (len >= 2)
    {
        if (pData[0] == 0xFF && pData[1] == 0xFE)
            markerUtf16 = 1;
        else if (pData[0] == 0xFE && pData[1] == 0xFF)
            markerUtf16 = 2;
    }

    if (0 != markerUtf16)
    {
        int nCountSymbols = (int)((len - 2) >> 1);
        if (0 == nCountSymbols)
            return std::string();

        // Every code point takes one or two UTF-16 code units, so the number
        // of code units is an upper bound for the number of code points.
        unsigned int* pUnicodes = new unsigned int[nCountSymbols];
        int nCountSymbolsNatural = 0;

        const bool bLittleEndian = (markerUtf16 == 1);
        const unsigned char* pCurrent = pData + 2;

        int i = 0;
        while (i < nCountSymbols)
        {
            unsigned int nLeading = bLittleEndian ? ((unsigned int)pCurrent[0] | ((unsigned int)pCurrent[1] << 8))
                                                  : ((unsigned int)pCurrent[1] | ((unsigned int)pCurrent[0] << 8));
            pCurrent += 2;
            ++i;

            if (nLeading < 0xD800 || nLeading > 0xDFFF)
            {
                // Basic Multilingual Plane: the code unit is the code point.
                pUnicodes[nCountSymbolsNatural++] = nLeading;
                continue;
            }

            // Only a high surrogate (D800..DBFF) may start a pair.
            if (nLeading <= 0xDBFF && i < nCountSymbols)
            {
                unsigned int nTrailing = bLittleEndian ? ((unsigned int)pCurrent[0] | ((unsigned int)pCurrent[1] << 8))
                                                       : ((unsigned int)pCurrent[1] | ((unsigned int)pCurrent[0] << 8));
                if (nTrailing >= 0xDC00 && nTrailing <= 0xDFFF)
                {
                    pUnicodes[nCountSymbolsNatural++] = 0x10000 + (((nLeading & 0x03FF) << 10) | (nTrailing & 0x03FF));
                    pCurrent += 2;
                    ++i;
                }
            }
            // An unpaired surrogate is skipped; the following code unit is not consumed,
            // so a valid character right after it is still decoded.
        }

        std::string sRet = NSStringExt::CConverter::GetUtf8FromUTF32(pUnicodes, nCountSymbolsNatural);
        RELEASEARRAYOBJECTS(pUnicodes);
        return sRet;
    }

    // No recognized BOM: return the bytes unchanged.
    return std::string((char*)pData, (size_t)len);
}