Files
core/XlsxSerializerCom/Reader/CSVReader.cpp
ElenaSubbotina ed409e745a fix bug #36905
2018-02-12 19:44:59 +03:00

479 lines
15 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
* (c) Copyright Ascensio System SIA 2010-2017
*
* This program is a free software product. You can redistribute it and/or
* modify it under the terms of the GNU Affero General Public License (AGPL)
* version 3 as published by the Free Software Foundation. In accordance with
* Section 7(a) of the GNU AGPL its Section 15 shall be amended to the effect
* that Ascensio System SIA expressly excludes the warranty of non-infringement
* of any third-party rights.
*
* This program is distributed WITHOUT ANY WARRANTY; without even the implied
* warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. For
* details, see the GNU AGPL at: http://www.gnu.org/licenses/agpl-3.0.html
*
* You can contact Ascensio System SIA at Lubanas st. 125a-25, Riga, Latvia,
* EU, LV-1021.
*
* The interactive user interfaces in modified source and object code versions
* of the Program must display Appropriate Legal Notices, as required under
* Section 5 of the GNU AGPL version 3.
*
* Pursuant to Section 7(b) of the License you must retain the original Product
* logo when distributing the program. Pursuant to Section 7(e) we decline to
* grant you any rights under trademark law for use of our trademarks.
*
* All the Product's GUI elements, including illustrations and icon sets, as
* well as technical writing content are licensed under the terms of the
* Creative Commons Attribution-ShareAlike 4.0 International. See the License
* terms at http://creativecommons.org/licenses/by-sa/4.0/legalcode
*
*/
#include "CSVReader.h"
#include <map>
#include <locale>
#include "../../DesktopEditor/common/File.h"
#include "../../Common/DocxFormat/Source/Base/unicode_util.h"
#include "../../Common/OfficeFileErrorDescription.h"
#include "../../UnicodeConverter/UnicodeConverter.h"
#include "../../UnicodeConverter/UnicodeConverter_Encodings.h"
#include "../../Common/DocxFormat/Source/XlsxFormat/Workbook/Workbook.h"
#include "../../Common/DocxFormat/Source/XlsxFormat/SharedStrings/SharedStrings.h"
#include "../../Common/DocxFormat/Source/XlsxFormat/Styles/Styles.h"
namespace CSVReader
{
const std::wstring ansi_2_unicode(const unsigned char* data, DWORD data_size)
{
std::wstring result;
std::locale loc("");
std::ctype<wchar_t> const &facet = std::use_facet<std::ctype<wchar_t> >(loc);
result.resize(data_size);
facet.widen((char*)data, (char*)data + data_size, &result[0]);
return result;
}
const std::wstring utf8_2_unicode(const unsigned char* data, DWORD data_size)
{
if (sizeof(wchar_t) == 2)//utf8 -> utf16
{
unsigned int nLength = data_size;
UTF16 *pStrUtf16 = new UTF16 [nLength + 1];
memset ((void *) pStrUtf16, 0, sizeof (UTF16) * (nLength + 1));
UTF8 *pStrUtf8 = (UTF8 *) data;
const UTF8 *pStrUtf8_Conv = pStrUtf8;
UTF16 *pStrUtf16_Conv = pStrUtf16;
ConversionResult eUnicodeConversionResult = ConvertUTF8toUTF16 (&pStrUtf8_Conv, &pStrUtf8[nLength]
, &pStrUtf16_Conv, &pStrUtf16 [nLength]
, strictConversion);
if (conversionOK != eUnicodeConversionResult)
{
delete [] pStrUtf16;
return std::wstring();
}
std::wstring utf16Str ((wchar_t *) pStrUtf16);
delete [] pStrUtf16;
return utf16Str;
}
else //utf8 -> utf32
{
unsigned int nLength = data_size;
UTF32 *pStrUtf32 = new UTF32 [nLength + 1];
memset ((void *) pStrUtf32, 0, sizeof (UTF32) * (nLength + 1));
UTF8 *pStrUtf8 = (UTF8 *) data;
const UTF8 *pStrUtf8_Conv = pStrUtf8;
UTF32 *pStrUtf32_Conv = pStrUtf32;
ConversionResult eUnicodeConversionResult = ConvertUTF8toUTF32 (&pStrUtf8_Conv, &pStrUtf8[nLength]
, &pStrUtf32_Conv, &pStrUtf32 [nLength]
, strictConversion);
if (conversionOK != eUnicodeConversionResult)
{
delete [] pStrUtf32;
return std::wstring();
}
std::wstring utf32Str ((wchar_t *) pStrUtf32);
delete [] pStrUtf32;
return utf32Str;
}
}
const std::wstring utf16_2_unicode(const unsigned char* data, DWORD data_size)
{
if (sizeof(wchar_t) == 2)//utf16 -> utf16
{
return std::wstring((wchar_t*)data, data_size / 2);
}
else //utf16 -> utf32
{
unsigned int nLength = data_size / 2;
UTF32 *pStrUtf32 = new UTF32 [nLength + 1];
memset ((void *) pStrUtf32, 0, sizeof (UTF32) * (nLength + 1));
UTF16 *pStrUtf16 = (UTF16 *) data;
const UTF16 *pStrUtf16_Conv = pStrUtf16;
UTF32 *pStrUtf32_Conv = pStrUtf32;
ConversionResult eUnicodeConversionResult = ConvertUTF16toUTF32 (&pStrUtf16_Conv, &pStrUtf16[nLength]
, &pStrUtf32_Conv, &pStrUtf32 [nLength]
, strictConversion);
if (conversionOK != eUnicodeConversionResult)
{
delete [] pStrUtf32;
return std::wstring();
}
std::wstring utf32Str ((wchar_t *) pStrUtf32);
delete [] pStrUtf32;
return utf32Str;
}
}
const std::wstring utf32_2_unicode(const unsigned char* data, DWORD data_size)
{
if (sizeof(wchar_t) == 4)//utf32 -> utf32
{
return std::wstring((wchar_t*)data, data_size / 4);
}
else //utf32 -> utf16
{
unsigned int nLength = data_size / 4;
UTF16 *pStrUtf16 = new UTF16 [nLength + 1];
memset ((void *) pStrUtf16, 0, sizeof (UTF16) * (nLength + 1));
UTF32 *pStrUtf32 = (UTF32 *) data;
const UTF32 *pStrUtf32_Conv = pStrUtf32;
UTF16 *pStrUtf16_Conv = pStrUtf16;
ConversionResult eUnicodeConversionResult = ConvertUTF32toUTF16 (&pStrUtf32_Conv, &pStrUtf32[nLength]
, &pStrUtf16_Conv, &pStrUtf16 [nLength]
, strictConversion);
if (conversionOK != eUnicodeConversionResult)
{
delete [] pStrUtf16;
return std::wstring();
}
std::wstring utf16Str ((wchar_t *) pStrUtf16);
delete [] pStrUtf16;
return utf16Str;
}
}
//-----------------------------------------------------------------------------------------------
void AddCell(std::wstring &sText, INT nStartCell, std::stack<INT> &oDeleteChars, OOX::Spreadsheet::CRow &oRow, INT nRow, INT nCol, bool bIsWrap)
{
while(!oDeleteChars.empty())
{
INT nIndex = oDeleteChars.top() - nStartCell;
sText.erase(nIndex, 1);
oDeleteChars.pop();
}
// Пустую не пишем
if (0 == sText.length())
return;
OOX::Spreadsheet::CCell *pCell = new OOX::Spreadsheet::CCell();
pCell->m_oType.Init();
WCHAR *pEndPtr;
double dValue = wcstod(sText.c_str(), &pEndPtr);
if (NULL != *pEndPtr)
{
// Не число
pCell->m_oType->SetValue(SimpleTypes::Spreadsheet::celltypeInlineStr);
pCell->m_oRichText.Init();
OOX::Spreadsheet::CText *pText = new OOX::Spreadsheet::CText();
pText->m_sText = sText;
pCell->m_oRichText->m_arrItems.push_back(pText);
}
else
{
// Число
pCell->m_oValue.Init();
pCell->m_oValue->m_sText = sText;
}
if (bIsWrap)
{
// WrapStyle
pCell->m_oStyle.Init();
pCell->m_oStyle->SetValue(1);
}
pCell->setRowCol(nRow, nCol);
oRow.m_arrItems.push_back(pCell);
}
int ReadFromCsvToXlsx(const std::wstring &sFileName, OOX::Spreadsheet::CXlsx &oXlsx, UINT nCodePage, const std::wstring& sDelimiter)
{
NSFile::CFileBinary oFile;
if (false == oFile.OpenFile(sFileName)) return AVS_FILEUTILS_ERROR_CONVERT;
//-----------------------------------------------------------------------------------
// Создадим Workbook
oXlsx.CreateWorkbook();
// Создадим стили
oXlsx.CreateStyles();
// Добавим стили для wrap-а
oXlsx.m_pStyles->m_oCellXfs.Init();
oXlsx.m_pStyles->m_oCellXfs->m_oCount.Init();
oXlsx.m_pStyles->m_oCellXfs->m_oCount->SetValue(2);
// Normall default
OOX::Spreadsheet::CXfs* pXfs = NULL;
pXfs = new OOX::Spreadsheet::CXfs();
pXfs->m_oBorderId.Init();
pXfs->m_oBorderId->SetValue(0);
pXfs->m_oFillId.Init();
pXfs->m_oFillId->SetValue(0);
pXfs->m_oFontId.Init();
pXfs->m_oFontId->SetValue(0);
pXfs->m_oNumFmtId.Init();
pXfs->m_oNumFmtId->SetValue(0);
oXlsx.m_pStyles->m_oCellXfs->m_arrItems.push_back(pXfs);
// Wrap style
pXfs = new OOX::Spreadsheet::CXfs();
pXfs->m_oBorderId.Init();
pXfs->m_oBorderId->SetValue(0);
pXfs->m_oFillId.Init();
pXfs->m_oFillId->SetValue(0);
pXfs->m_oFontId.Init();
pXfs->m_oFontId->SetValue(0);
pXfs->m_oNumFmtId.Init();
pXfs->m_oNumFmtId->SetValue(0);
pXfs->m_oApplyAlignment.Init();
pXfs->m_oApplyAlignment->SetValue(SimpleTypes::onoffTrue);
pXfs->m_oAligment.Init();
pXfs->m_oAligment->m_oWrapText.Init();
pXfs->m_oAligment->m_oWrapText->SetValue(SimpleTypes::onoffTrue);
oXlsx.m_pStyles->m_oCellXfs->m_arrItems.push_back(pXfs);
std::wstring sSheetRId = L"rId1";
OOX::Spreadsheet::CWorksheet* pWorksheet = new OOX::Spreadsheet::CWorksheet(NULL);
pWorksheet->m_oSheetData.Init();
OOX::Spreadsheet::CSheet *pSheet = new OOX::Spreadsheet::CSheet();
pSheet->m_oName.Init();
pSheet->m_oName->append(L"Sheet1");
pSheet->m_oSheetId.Init();
pSheet->m_oSheetId->SetValue(1);
pSheet->m_oRid.Init();
pSheet->m_oRid->SetValue(sSheetRId);
oXlsx.m_pWorkbook->m_oSheets.Init();
oXlsx.m_pWorkbook->m_oSheets->m_arrItems.push_back(pSheet);
//-----------------------------------------------------------------------------------
DWORD nFileSize = 0;
BYTE* pFileData = new BYTE[oFile.GetFileSize()];
oFile.ReadFile(pFileData, oFile.GetFileSize(), nFileSize);
oFile.CloseFile();
//skip bom
DWORD nInputBufferSize = nFileSize;
BYTE* pInputBuffer = pFileData;
if (nInputBufferSize >= 3 && 0xef == pInputBuffer[0] && 0xbb == pInputBuffer[1] && 0xbf == pInputBuffer[2])
{
nInputBufferSize -= 3;
pInputBuffer += 3;
}
else if (nInputBufferSize >= 2 && ((0xfe == pInputBuffer[0] && 0xff == pInputBuffer[1]) || (0xff == pInputBuffer[0] && 0xfe == pInputBuffer[1])))
{
nInputBufferSize -= 2;
pInputBuffer += 2;
}
std::wstring sFileDataW;
if (nCodePage == 1000)
{
sFileDataW = ansi_2_unicode(pInputBuffer, nInputBufferSize);
}
else if (nCodePage == 46)//utf-8
{
sFileDataW = utf8_2_unicode(pInputBuffer, nInputBufferSize);
}
else if (nCodePage == 48)//utf-16
{
sFileDataW = utf16_2_unicode(pInputBuffer, nInputBufferSize);
}
else if (nCodePage == 50) // utf-32
{
sFileDataW = utf32_2_unicode(pInputBuffer, nInputBufferSize);
}
else
{
const NSUnicodeConverter::EncodindId& oEncodindId = NSUnicodeConverter::Encodings[nCodePage];
NSUnicodeConverter::CUnicodeConverter oUnicodeConverter;
sFileDataW = oUnicodeConverter.toUnicode((const char*)pInputBuffer, nInputBufferSize, oEncodindId.Name);
}
//------------------------------------------------------------------------------------------------------------------------------
size_t nSize = sFileDataW.length();
if (nSize < 1 && nInputBufferSize > 0)
{//для синхронности вывода превью и нормального результата
const NSUnicodeConverter::EncodindId& oEncodindId = NSUnicodeConverter::Encodings[nCodePage];
NSUnicodeConverter::CUnicodeConverter oUnicodeConverter;
sFileDataW = oUnicodeConverter.toUnicode((const char*)pInputBuffer, nInputBufferSize, oEncodindId.Name);
nSize = sFileDataW.length();
//return AVS_FILEUTILS_ERROR_CONVERT_ICU;
}
const WCHAR *pTemp = sFileDataW.c_str();
WCHAR wcDelimiterLeading = L'\0';
WCHAR wcDelimiterTrailing = L'\0';
int nDelimiterSize = 0;
if (sDelimiter.length() > 0)
{
wcDelimiterLeading = sDelimiter[0];
nDelimiterSize = 1;
if (2 == sizeof(wchar_t) && 0xD800 <= wcDelimiterLeading && wcDelimiterLeading <= 0xDBFF && sDelimiter.length() > 1)
{
wcDelimiterTrailing = sDelimiter[1];
nDelimiterSize = 2;
}
}
const WCHAR wcNewLineN = _T('\n');
const WCHAR wcNewLineR = _T('\r');
const WCHAR wcQuote = _T('"');
const WCHAR wcTab = _T('\t');
bool bIsWrap = false;
WCHAR wcCurrent;
INT nStartCell = 0;
std::stack<INT> oDeleteChars;
bool bInQuote = false;
INT nIndexRow = 0;
INT nIndexCol = 0;
OOX::Spreadsheet::CRow *pRow = new OOX::Spreadsheet::CRow();
pRow->m_oR.Init();
pRow->m_oR->SetValue(nIndexRow + 1);
for (size_t nIndex = 0; nIndex < nSize; ++nIndex)
{
wcCurrent = pTemp[nIndex];
if (wcDelimiterLeading == wcCurrent && (L'\0' == wcDelimiterTrailing || (nIndex + 1 < nSize && wcDelimiterTrailing == pTemp[nIndex + 1])))
{
if (bInQuote)
continue;
// New Cell
std::wstring sCellText(pTemp + nStartCell, nIndex - nStartCell);
AddCell(sCellText, nStartCell, oDeleteChars, *pRow, nIndexRow, nIndexCol++, bIsWrap);
bIsWrap = false;
nStartCell = nIndex + nDelimiterSize;
if (nStartCell == nSize)
{
pWorksheet->m_oSheetData->m_arrItems.push_back(pRow);
pRow = NULL;
}
}
else if (wcNewLineN == wcCurrent || wcNewLineR == wcCurrent)
{
if (bInQuote)
{
// Добавим Wrap
bIsWrap = true;
continue;
}
// New line
if (nStartCell != nIndex)
{
std::wstring sCellText(pTemp + nStartCell, nIndex - nStartCell);
AddCell(sCellText, nStartCell, oDeleteChars, *pRow, nIndexRow, nIndexCol++, bIsWrap);
bIsWrap = false;
}
if (wcNewLineR == wcCurrent && nIndex + 1 != nSize && wcNewLineN == pTemp[nIndex + 1])
{
// На комбинацию \r\n должен быть только 1 перенос
++nIndex;
}
nStartCell = nIndex + 1;
pWorksheet->m_oSheetData->m_arrItems.push_back(pRow);
pRow = new OOX::Spreadsheet::CRow();
pRow->m_oR.Init();
pRow->m_oR->SetValue(++nIndexRow + 1);
nIndexCol = 0;
}
else if (wcQuote == wcCurrent)
{
// Quote
if (false == bInQuote && nStartCell == nIndex && nIndex + 1 != nSize)
{
// Начало новой ячейки (только если мы сразу после разделителя и не в конце файла)
bInQuote = !bInQuote;
nStartCell = nIndex + 1;
}
else if ( bInQuote )
{
// Нужно удалить кавычку ограничитель
oDeleteChars.push(nIndex);
// Если следующий символ кавычка, то мы не закончили ограничитель строки (1997,Ford,E350,"Super, ""luxurious"" truck")
if (nIndex + 1 != nSize && wcQuote == pTemp[nIndex + 1])
++nIndex;
else
bInQuote = !bInQuote;
}
}
else if (wcTab == wcCurrent)
{
// delete tab if not delimiter
oDeleteChars.push(nIndex);
}
}
if (nStartCell != nSize)
{
// New line
std::wstring sCellText(pTemp + nStartCell, nSize - nStartCell);
AddCell(sCellText, nStartCell, oDeleteChars, *pRow, nIndexRow, nIndexCol++, bIsWrap);
pWorksheet->m_oSheetData->m_arrItems.push_back(pRow);
}
else
{
RELEASEOBJECT(pRow);
}
oXlsx.m_arWorksheets.push_back(pWorksheet);
oXlsx.m_mapWorksheets.insert(std::make_pair(sSheetRId, pWorksheet));
return 0;
}
}