Files
core/XlsxSerializerCom/Reader/CSVReader.cpp
Elena.Subbotina 65d6a6f56d fix bug #58624
2022-08-24 11:34:01 +03:00

594 lines
18 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
* (c) Copyright Ascensio System SIA 2010-2019
*
* This program is a free software product. You can redistribute it and/or
* modify it under the terms of the GNU Affero General Public License (AGPL)
* version 3 as published by the Free Software Foundation. In accordance with
* Section 7(a) of the GNU AGPL its Section 15 shall be amended to the effect
* that Ascensio System SIA expressly excludes the warranty of non-infringement
* of any third-party rights.
*
* This program is distributed WITHOUT ANY WARRANTY; without even the implied
* warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. For
* details, see the GNU AGPL at: http://www.gnu.org/licenses/agpl-3.0.html
*
* You can contact Ascensio System SIA at 20A-12 Ernesta Birznieka-Upisha
* street, Riga, Latvia, EU, LV-1050.
*
* The interactive user interfaces in modified source and object code versions
* of the Program must display Appropriate Legal Notices, as required under
* Section 5 of the GNU AGPL version 3.
*
* Pursuant to Section 7(b) of the License you must retain the original Product
* logo when distributing the program. Pursuant to Section 7(e) we decline to
* grant you any rights under trademark law for use of our trademarks.
*
* All the Product's GUI elements, including illustrations and icon sets, as
* well as technical writing content are licensed under the terms of the
* Creative Commons Attribution-ShareAlike 4.0 International. See the License
* terms at http://creativecommons.org/licenses/by-sa/4.0/legalcode
*
*/
#include "CSVReader.h"
#include "../../Common/DocxFormat/Source/XlsbFormat/Xlsb.h"
#include "../../Common/DocxFormat/Source/XlsxFormat/Worksheets/Worksheet.h"
#include "../../DesktopEditor/common/File.h"
#include <map>
#include <locale>
#include "../../DesktopEditor/common/File.h"
#include "../../Common/DocxFormat/Source/Base/unicode_util.h"
#include "../../Common/OfficeFileErrorDescription.h"
#include "../../UnicodeConverter/UnicodeConverter.h"
#include "../../UnicodeConverter/UnicodeConverter_Encodings.h"
#include "../../Common/DocxFormat/Source/XlsxFormat/Workbook/Workbook.h"
#include "../../Common/DocxFormat/Source/XlsxFormat/SharedStrings/SharedStrings.h"
#include "../../Common/DocxFormat/Source/XlsxFormat/Styles/Styles.h"
// Private implementation of CSVReader: decodes the raw file bytes into a wide
// string and turns the CSV text into rows/cells of a spreadsheet document.
class CSVReader::Impl
{
public:
    Impl() {}

    // Parses the CSV file sFileName into oXlsx using the encoding selected by
    // nCodePage and the given field delimiter. Returns 0 on success or an
    // AVS_FILEUTILS_* error code.
    _UINT32 Read(const std::wstring &sFileName, OOX::Spreadsheet::CXlsx &oXlsx, _UINT32 nCodePage, const std::wstring& wcDelimiter);

private:
    // Builds one cell from the raw field text and appends it to oRow.
    void AddCell(std::wstring &sText, INT nStartCell, std::stack<INT> &oDeleteChars, OOX::Spreadsheet::CRow &oRow, INT nRow, INT nCol, bool bIsWrap);

    // Number format code -> index of the cell style that was created for it.
    std::map<std::wstring, unsigned int> mapDataNumber;
    // Styles of the document being filled; assigned in Read(), not owned here.
    OOX::Spreadsheet::CStyles *m_pStyles = NULL;
    //---------------------------------------------------------------------------------------------------------

    // Widens every byte of data to a wchar_t with the ctype facet of the
    // user-preferred locale (std::locale("")).
    const std::wstring ansi_2_unicode(const unsigned char* data, DWORD data_size)
    {
        std::wstring result;
        std::locale loc("");
        std::ctype<wchar_t> const &facet = std::use_facet<std::ctype<wchar_t> >(loc);
        result.resize(data_size);
        facet.widen((char*)data, (char*)data + data_size, &result[0]);
        return result;
    }

    // Decodes data_size bytes of UTF-8 into wStr (UTF-16 or UTF-32 depending on
    // sizeof(wchar_t)). On success wStr holds exactly the decoded characters.
    // If the input is not valid UTF-8 every byte is widened to one character
    // with the same numeric value (0..255) instead.
    void utf8_2_unicode(const unsigned char* data, DWORD data_size, std::wstring &wStr)
    {
        const unsigned int nLength = data_size;
        // A UTF-8 sequence never decodes to more code units than it has bytes,
        // so nLength units are always enough; the extra zero-filled unit keeps
        // &wStr[0] valid for empty input.
        wStr.assign(nLength + 1, L'\0');

        const UTF8 *pStrUtf8 = (const UTF8 *) data;
        const UTF8 *pStrUtf8_Conv = pStrUtf8;
        size_t nConverted = 0;
        ConversionResult eUnicodeConversionResult = conversionOK;

        if (sizeof(wchar_t) == 2)//utf8 -> utf16
        {
            UTF16 *pStrUtf16 = (UTF16 *)&wStr[0];
            UTF16 *pStrUtf16_Conv = pStrUtf16;
            eUnicodeConversionResult = ConvertUTF8toUTF16 (&pStrUtf8_Conv, pStrUtf8 + nLength
                                                          , &pStrUtf16_Conv, pStrUtf16 + nLength
                                                          , strictConversion);
            // The converter advances the target pointer past the last unit written.
            nConverted = pStrUtf16_Conv - pStrUtf16;
        }
        else //utf8 -> utf32
        {
            UTF32 *pStrUtf32 = (UTF32 *)&wStr[0];
            UTF32 *pStrUtf32_Conv = pStrUtf32;
            eUnicodeConversionResult = ConvertUTF8toUTF32 (&pStrUtf8_Conv, pStrUtf8 + nLength
                                                          , &pStrUtf32_Conv, pStrUtf32 + nLength
                                                          , strictConversion);
            nConverted = pStrUtf32_Conv - pStrUtf32;
        }

        if (conversionOK == eUnicodeConversionResult)
        {
            // Drop the unused tail so no NUL padding leaks into the parsed text.
            wStr.resize(nConverted);
        }
        else
        {
            // Not valid UTF-8: widen byte by byte. Going through unsigned char
            // avoids sign extension of bytes >= 0x80.
            wStr.assign(data, data + data_size);
        }
    }

    // Decodes UTF-16 data in native byte order. When wchar_t is 2 bytes the
    // units are copied as is; otherwise they are converted to UTF-32, an empty
    // string is returned on a conversion error, and the result stops at the
    // first NUL character.
    const std::wstring utf16_2_unicode(const unsigned char* data, DWORD data_size)
    {
        if (sizeof(wchar_t) == 2)//utf16 -> utf16
        {
            return std::wstring((wchar_t*)data, data_size / 2);
        }
        else //utf16 -> utf32
        {
            unsigned int nLength = data_size / 2;
            UTF32 *pStrUtf32 = new UTF32 [nLength + 1];
            memset ((void *) pStrUtf32, 0, sizeof (UTF32) * (nLength + 1));
            UTF16 *pStrUtf16 = (UTF16 *) data;
            const UTF16 *pStrUtf16_Conv = pStrUtf16;
            UTF32 *pStrUtf32_Conv = pStrUtf32;
            ConversionResult eUnicodeConversionResult = ConvertUTF16toUTF32 (&pStrUtf16_Conv, &pStrUtf16[nLength]
                                                                            , &pStrUtf32_Conv, &pStrUtf32 [nLength]
                                                                            , strictConversion);
            if (conversionOK != eUnicodeConversionResult)
            {
                delete [] pStrUtf32;
                return std::wstring();
            }
            std::wstring utf32Str ((wchar_t *) pStrUtf32);
            delete [] pStrUtf32;
            return utf32Str;
        }
    }

    // Decodes UTF-32 data in native byte order. When wchar_t is 4 bytes the
    // units are copied as is; otherwise they are converted to UTF-16, an empty
    // string is returned on a conversion error, and the result stops at the
    // first NUL character.
    const std::wstring utf32_2_unicode(const unsigned char* data, DWORD data_size)
    {
        if (sizeof(wchar_t) == 4)//utf32 -> utf32
        {
            return std::wstring((wchar_t*)data, data_size / 4);
        }
        else //utf32 -> utf16
        {
            unsigned int nLength = data_size / 4;
            UTF16 *pStrUtf16 = new UTF16 [nLength + 1];
            memset ((void *) pStrUtf16, 0, sizeof (UTF16) * (nLength + 1));
            UTF32 *pStrUtf32 = (UTF32 *) data;
            const UTF32 *pStrUtf32_Conv = pStrUtf32;
            UTF16 *pStrUtf16_Conv = pStrUtf16;
            ConversionResult eUnicodeConversionResult = ConvertUTF32toUTF16 (&pStrUtf32_Conv, &pStrUtf32[nLength]
                                                                            , &pStrUtf16_Conv, &pStrUtf16 [nLength]
                                                                            , strictConversion);
            if (conversionOK != eUnicodeConversionResult)
            {
                delete [] pStrUtf16;
                return std::wstring();
            }
            std::wstring utf16Str ((wchar_t *) pStrUtf16);
            delete [] pStrUtf16;
            return utf16Str;
        }
    }
};
//-----------------------------------------------------------------------------------------------
void CSVReader::Impl::AddCell(std::wstring &sText, INT nStartCell, std::stack<INT> &oDeleteChars, OOX::Spreadsheet::CRow &oRow, INT nRow, INT nCol, bool bIsWrap)
{
while (!oDeleteChars.empty())
{
INT nIndex = oDeleteChars.top() - nStartCell;
sText.erase(nIndex, 1);
oDeleteChars.pop();
}
size_t length = sText.length();
// Пустую не пишем
if ((0 == length) || (sText[0] == L'\0'))
return;
OOX::Spreadsheet::CCell *pCell = new OOX::Spreadsheet::CCell();
pCell->m_oType.Init();
pCell->m_oCacheValue = sText; // как есть
wchar_t *pEndPtr;
double dValue = wcstod(sText.c_str(), &pEndPtr);
if (std::isnan(dValue) || std::isinf(dValue))
pEndPtr = (wchar_t *)sText.c_str();
if ((0 == *pEndPtr) || (pEndPtr != sText.c_str() && (sText.c_str() + length - pEndPtr) < 4))
{
std::wstring data_format;
std::wstring postfix;
if (0 != *pEndPtr)
{
size_t sz = length - (pEndPtr - sText.c_str());
while (sz > 0)
{
if (pEndPtr[sz - 1] != L' ')
break;
sz--;
}
if (sz > 0)
{
postfix = std::wstring(pEndPtr, sz);
}
}
size_t pos = sText.find(L".");
if (pos != std::wstring::npos)
{
size_t fraction = sText.length() - pos - ((0 != *pEndPtr) ? 2 : 1);
for (size_t i = 0; i < fraction && fraction != std::wstring::npos; ++i)
data_format += L"0";
}
if (false == data_format.empty()) data_format = L"." + data_format;
pCell->m_oValue.Init();
if (0 != *pEndPtr)
{
if (false == postfix.empty())
{
if (postfix[0] == L'%')
{
pCell->m_oValue->m_sText = std::to_wstring(dValue / 100.);
}
else
{
pCell->m_oValue->m_sText = sText.substr(0, length - 1);
for (size_t i = 0; i < postfix.size(); ++i)
{
data_format += std::wstring(L"\\") + postfix[i];
}
}
}
}
else
pCell->m_oValue->m_sText = sText;
if (false == data_format.empty())
{
data_format = L"0" + data_format;
std::map<std::wstring, unsigned int>::iterator pFind = mapDataNumber.find(data_format);
if (pFind != mapDataNumber.end())
{
pCell->m_oStyle = pFind->second;
}
else
{
if (!m_pStyles->m_oNumFmts.IsInit()) m_pStyles->m_oNumFmts.Init();
m_pStyles->m_oNumFmts->m_arrItems.push_back(new OOX::Spreadsheet::CNumFmt());
m_pStyles->m_oNumFmts->m_arrItems.back()->m_oFormatCode = data_format;
m_pStyles->m_oNumFmts->m_arrItems.back()->m_oNumFmtId.Init();
m_pStyles->m_oNumFmts->m_arrItems.back()->m_oNumFmtId->SetValue(164 + m_pStyles->m_oNumFmts->m_arrItems.size());
// Normal + data format
OOX::Spreadsheet::CXfs* pXfs = new OOX::Spreadsheet::CXfs();
pXfs->m_oBorderId.Init(); pXfs->m_oBorderId->SetValue(0);
pXfs->m_oFillId.Init(); pXfs->m_oFillId->SetValue(0);
pXfs->m_oFontId.Init(); pXfs->m_oFontId->SetValue(0);
pXfs->m_oNumFmtId.Init(); pXfs->m_oNumFmtId->SetValue(m_pStyles->m_oNumFmts->m_arrItems.back()->m_oNumFmtId->GetValue());
m_pStyles->m_oCellXfs->m_arrItems.push_back(pXfs);
pCell->m_oStyle = (unsigned int)(m_pStyles->m_oCellXfs->m_arrItems.size() - 1);
mapDataNumber.insert(std::make_pair(data_format, *pCell->m_oStyle));
}
}
}
else
{
pCell->m_oType->SetValue(SimpleTypes::Spreadsheet::celltypeInlineStr);
pCell->m_oRichText.Init();
OOX::Spreadsheet::CText *pText = new OOX::Spreadsheet::CText();
pText->m_sText = sText;
pCell->m_oRichText->m_arrItems.push_back(pText);
}
if (bIsWrap)
{
pCell->m_oStyle = 1;
}
pCell->setRowCol(nRow, nCol);
oRow.m_arrItems.push_back(pCell);
}
// Reads the CSV file sFileName and fills oXlsx with a workbook, styles and a
// single worksheet named "Sheet1".
//
//  nCodePage  - encoding selector: 1000 = locale ANSI, 46 = UTF-8, 48 = UTF-16,
//               50 = UTF-32, anything else is an index into NSUnicodeConverter::Encodings
//  sDelimiter - field delimiter; its first character is used (two characters
//               when wchar_t is 2 bytes and the delimiter starts with a high surrogate)
//
// Returns AVS_FILEUTILS_ERROR_CONVERT when the file cannot be opened,
// AVS_FILEUTILS_ERROR_CONVERT_ROWLIMITS when parsing stopped at the row limit,
// and 0 otherwise.
_UINT32 CSVReader::Impl::Read(const std::wstring &sFileName, OOX::Spreadsheet::CXlsx &oXlsx, _UINT32 nCodePage, const std::wstring& sDelimiter)
{
    NSFile::CFileBinary oFile;
    if (false == oFile.OpenFile(sFileName)) return AVS_FILEUTILS_ERROR_CONVERT;
    //-----------------------------------------------------------------------------------
    // Create the workbook
    oXlsx.CreateWorkbook();
    // Create the styles
    oXlsx.CreateStyles();

    // Add the cell formats: index 0 = default, index 1 = wrapped text
    oXlsx.m_pStyles->m_oCellXfs.Init();
    oXlsx.m_pStyles->m_oCellXfs->m_oCount.Init();
    oXlsx.m_pStyles->m_oCellXfs->m_oCount->SetValue(2);

    m_pStyles = oXlsx.m_pStyles;

    // Normal default
    OOX::Spreadsheet::CXfs* pXfs = NULL;
    pXfs = new OOX::Spreadsheet::CXfs();
    pXfs->m_oBorderId.Init(); pXfs->m_oBorderId->SetValue(0);
    pXfs->m_oFillId.Init(); pXfs->m_oFillId->SetValue(0);
    pXfs->m_oFontId.Init(); pXfs->m_oFontId->SetValue(0);
    pXfs->m_oNumFmtId.Init(); pXfs->m_oNumFmtId->SetValue(0);
    oXlsx.m_pStyles->m_oCellXfs->m_arrItems.push_back(pXfs);

    // Wrap style
    pXfs = new OOX::Spreadsheet::CXfs();
    pXfs->m_oBorderId.Init(); pXfs->m_oBorderId->SetValue(0);
    pXfs->m_oFillId.Init(); pXfs->m_oFillId->SetValue(0);
    pXfs->m_oFontId.Init(); pXfs->m_oFontId->SetValue(0);
    pXfs->m_oNumFmtId.Init(); pXfs->m_oNumFmtId->SetValue(0);
    pXfs->m_oApplyAlignment.Init(); pXfs->m_oApplyAlignment->SetValue(SimpleTypes::onoffTrue);
    pXfs->m_oAligment.Init(); pXfs->m_oAligment->m_oWrapText.Init();
    pXfs->m_oAligment->m_oWrapText->SetValue(SimpleTypes::onoffTrue);
    oXlsx.m_pStyles->m_oCellXfs->m_arrItems.push_back(pXfs);

    smart_ptr<OOX::Spreadsheet::CWorksheet> pWorksheet(new OOX::Spreadsheet::CWorksheet(NULL));
    pWorksheet->m_oSheetData.Init();
    //-----------------------------------------------------------------------------------
    // Load the whole file into memory
    DWORD nFileSize = 0;
    BYTE* pFileData = new BYTE[oFile.GetFileSize() + 64];
    oFile.ReadFile(pFileData, oFile.GetFileSize(), nFileSize);
    oFile.CloseFile();

    //skip bom
    DWORD nInputBufferSize = nFileSize;
    BYTE* pInputBuffer = pFileData;
    if (nInputBufferSize >= 3 && 0xef == pInputBuffer[0] && 0xbb == pInputBuffer[1] && 0xbf == pInputBuffer[2])
    {
        nInputBufferSize -= 3;
        pInputBuffer += 3;
    }
    else if (50 == nCodePage && nInputBufferSize >= 4 && 0xff == pInputBuffer[0] && 0xfe == pInputBuffer[1] && 0x00 == pInputBuffer[2] && 0x00 == pInputBuffer[3])
    {
        // UTF-32 (little-endian) BOM: it begins with the same two bytes as the
        // UTF-16 one, so it has to be checked first and skipped completely;
        // skipping only two bytes would misalign every following character.
        nInputBufferSize -= 4;
        pInputBuffer += 4;
    }
    else if (nInputBufferSize >= 2 && ((0xfe == pInputBuffer[0] && 0xff == pInputBuffer[1]) || (0xff == pInputBuffer[0] && 0xfe == pInputBuffer[1])))
    {
        nInputBufferSize -= 2;
        pInputBuffer += 2;
    }

    // Decode the bytes into a wide string
    std::wstring sFileDataW;
    if (nCodePage == 1000)
    {
        sFileDataW = ansi_2_unicode(pInputBuffer, nInputBufferSize);
    }
    else if (nCodePage == 46)//utf-8
    {
        utf8_2_unicode(pInputBuffer, nInputBufferSize, sFileDataW);
    }
    else if (nCodePage == 48)//utf-16
    {
        sFileDataW = utf16_2_unicode(pInputBuffer, nInputBufferSize);
    }
    else if (nCodePage == 50) // utf-32
    {
        sFileDataW = utf32_2_unicode(pInputBuffer, nInputBufferSize);
    }
    else
    {
        // NOTE(review): nCodePage is used as an array index without a range check - confirm callers validate it.
        const NSUnicodeConverter::EncodindId& oEncodindId = NSUnicodeConverter::Encodings[nCodePage];
        NSUnicodeConverter::CUnicodeConverter oUnicodeConverter;
        sFileDataW = oUnicodeConverter.toUnicode((const char*)pInputBuffer, nInputBufferSize, oEncodindId.Name);
    }
    //------------------------------------------------------------------------------------------------------------------------------
    size_t nSize = sFileDataW.length();
    if (nSize < 1 && nInputBufferSize > 0)
    {// keeps the preview output and the normal result in sync
        const NSUnicodeConverter::EncodindId& oEncodindId = NSUnicodeConverter::Encodings[nCodePage];
        NSUnicodeConverter::CUnicodeConverter oUnicodeConverter;
        sFileDataW = oUnicodeConverter.toUnicode((const char*)pInputBuffer, nInputBufferSize, oEncodindId.Name);
        nSize = sFileDataW.length();
        //return AVS_FILEUTILS_ERROR_CONVERT_ICU;
    }
    // pInputBuffer points into pFileData, so the buffer may only be released
    // once the fallback conversion above no longer needs it.
    delete[]pFileData;

    // The delimiter is one character, or a surrogate pair when wchar_t is 2 bytes
    WCHAR wcDelimiterLeading = L'\0';
    WCHAR wcDelimiterTrailing = L'\0';
    int nDelimiterSize = 0;
    if (sDelimiter.length() > 0)
    {
        wcDelimiterLeading = sDelimiter[0];
        nDelimiterSize = 1;
        if (2 == sizeof(wchar_t) && 0xD800 <= wcDelimiterLeading && wcDelimiterLeading <= 0xDBFF && sDelimiter.length() > 1)
        {
            wcDelimiterTrailing = sDelimiter[1];
            nDelimiterSize = 2;
        }
    }

    const WCHAR wcNewLineN = _T('\n');
    const WCHAR wcNewLineR = _T('\r');
    const WCHAR wcQuote = _T('"');
    const WCHAR wcTab = _T('\t');

    bool bIsWrap = false;          // the current field contains a line break inside quotes
    WCHAR wcCurrent;
    INT nStartCell = 0;            // position of the first character of the current field
    std::stack<INT> oDeleteChars;  // positions of characters to drop from the current field
    bool bMsLimit = false;         // parsing stopped at the row limit
    bool bInQuote = false;         // currently inside a quoted field
    INT nIndexRow = 0;
    INT nIndexCol = 0;

    OOX::Spreadsheet::CRow *pRow = new OOX::Spreadsheet::CRow();
    pRow->m_oR.Init();
    pRow->m_oR->SetValue(nIndexRow + 1);

    const WCHAR *pTemp = sFileDataW.c_str();
    for (size_t nIndex = 0; nIndex < nSize; ++nIndex)
    {
        wcCurrent = pTemp[nIndex];
        if (wcDelimiterLeading == wcCurrent && (L'\0' == wcDelimiterTrailing || (nIndex + 1 < nSize && wcDelimiterTrailing == pTemp[nIndex + 1])))
        {
            if (bInQuote)
                continue;
            // New Cell
            std::wstring sCellText(pTemp + nStartCell, nIndex - nStartCell);
            AddCell(sCellText, nStartCell, oDeleteChars, *pRow, nIndexRow, nIndexCol++, bIsWrap);
            oDeleteChars = std::stack<INT>();
            bIsWrap = false;

            if (nIndex + nDelimiterSize == nSize)
            {
                // The file ends with a delimiter: the row is complete
                pWorksheet->m_oSheetData->m_arrItems.push_back(pRow);
                pRow = NULL;
            }

            if (nIndex + nDelimiterSize > 500000)
            {
                // Drop the already processed prefix of the buffer and restart the positions
                nStartCell = 0;
                sFileDataW.erase(0, nIndex + nDelimiterSize);
                nSize -= (nIndex + nDelimiterSize);
                pTemp = sFileDataW.c_str();
                // Wraps to 0 on the loop's ++nIndex, so the first remaining
                // character is examined as well (it may be a delimiter, quote or line break).
                nIndex = static_cast<size_t>(-1);
            }
            else
                nStartCell = nIndex + nDelimiterSize;
        }
        else if (wcNewLineN == wcCurrent || wcNewLineR == wcCurrent)
        {
            if (bInQuote)
            {
                // A line break inside quotes stays in the field: mark the cell as wrapped
                bIsWrap = true;
                continue;
            }
            // New line
            if (nStartCell != nIndex)
            {
                std::wstring sCellText(pTemp + nStartCell, nIndex - nStartCell);
                AddCell(sCellText, nStartCell, oDeleteChars, *pRow, nIndexRow, nIndexCol++, bIsWrap);
                bIsWrap = false;
            }

            if (wcNewLineR == wcCurrent && nIndex + 1 != nSize && wcNewLineN == pTemp[nIndex + 1])
            {
                // A \r\n pair counts as a single line break
                ++nIndex;
            }

            if (nIndex + 1 > 500000)
            {
                // Drop the already processed prefix of the buffer and restart the positions
                nStartCell = 0;
                sFileDataW.erase(0, nIndex + 1);
                nSize -= (nIndex + 1);
                pTemp = sFileDataW.c_str();
                // Wraps to 0 on the loop's ++nIndex (see the delimiter branch)
                nIndex = static_cast<size_t>(-1);
            }
            else
                nStartCell = nIndex + 1;

            pWorksheet->m_oSheetData->m_arrItems.push_back(pRow);
            pRow = new OOX::Spreadsheet::CRow();
            pRow->m_oR.Init();
            pRow->m_oR->SetValue(++nIndexRow + 1);
            nIndexCol = 0;

            if (pWorksheet->m_oSheetData->m_arrItems.size() > 1048576)
            {
                bMsLimit = true;
                break; // MS row limit
            }
        }
        else if (wcQuote == wcCurrent)
        {
            // Quote
            if (false == bInQuote && nStartCell == nIndex && nIndex + 1 != nSize)
            {
                // Start of a quoted field (only right after a delimiter and not at the end of the file)
                bInQuote = !bInQuote;
                nStartCell = nIndex + 1;
            }
            else if (bInQuote)
            {
                // The quote character itself has to be removed from the field
                oDeleteChars.push(nIndex);
                // A doubled quote is an escaped quote, the quoted field goes on (1997,Ford,E350,"Super, ""luxurious"" truck")
                if (nIndex + 1 != nSize && wcQuote == pTemp[nIndex + 1])
                    ++nIndex;
                else
                    bInQuote = !bInQuote;
            }
        }
        else if (wcTab == wcCurrent)
        {
            // delete tab if not delimiter
            oDeleteChars.push(nIndex);
        }
    }

    if (nStartCell != nSize && !bMsLimit)
    {
        // The last field is not terminated by a delimiter or a line break
        std::wstring sCellText(pTemp + nStartCell, nSize - nStartCell);
        AddCell(sCellText, nStartCell, oDeleteChars, *pRow, nIndexRow, nIndexCol++, bIsWrap);
        pWorksheet->m_oSheetData->m_arrItems.push_back(pRow);
    }
    else
    {
        RELEASEOBJECT(pRow);
    }

    // Register the worksheet in the document and in the workbook
    oXlsx.m_arWorksheets.push_back(pWorksheet.GetPointer());
    smart_ptr<OOX::File> oWorksheetFile = pWorksheet.smart_dynamic_cast<OOX::File>();
    const OOX::RId oRid = oXlsx.m_pWorkbook->Add(oWorksheetFile);
    oXlsx.m_mapWorksheets.insert(std::make_pair(oRid.ToString(), pWorksheet.GetPointer())); // for bin

    OOX::Spreadsheet::CSheet *pSheet = new OOX::Spreadsheet::CSheet();
    pSheet->m_oName = L"Sheet1";
    pSheet->m_oSheetId.Init();
    pSheet->m_oSheetId->SetValue(1);
    pSheet->m_oRid.Init();
    pSheet->m_oRid->SetValue(oRid.ToString());

    oXlsx.m_pWorkbook->m_oSheets.Init();
    oXlsx.m_pWorkbook->m_oSheets->m_arrItems.push_back(pSheet);

    return bMsLimit ? AVS_FILEUTILS_ERROR_CONVERT_ROWLIMITS : 0;
}
//----------------------------------------------------------------------------------
// Creates the reader together with its private implementation object.
CSVReader::CSVReader()
    : impl_(new Impl())
{
}
// Defined in this translation unit, where Impl is a complete type.
CSVReader::~CSVReader() = default;
// Public entry point: forwards all arguments to the implementation object and
// returns its result code unchanged.
_UINT32 CSVReader::Read(const std::wstring &sFileName, OOX::Spreadsheet::CXlsx &oXlsx, _UINT32 nCodePage, const std::wstring& sDelimiter)
{
    const _UINT32 nResult = impl_->Read(sFileName, oXlsx, nCodePage, sDelimiter);
    return nResult;
}