DocFormatReader - дешифрование + поддержка формата 95 года

2026-04-07 13:55:33 +08:00 · 2016-06-26 17:37:17 +03:00
parent 0d261b8769
commit de899687bb
118 changed files with 3963 additions and 3492 deletions
--- a/ASCOfficeDocFile/DocDocxConverter/PieceTable.cpp
+++ b/ASCOfficeDocFile/DocDocxConverter/PieceTable.cpp
@ -34,17 +34,22 @@

 namespace DocFileFormat
 {
-	PieceTable::PieceTable (FileInformationBlock *fib, POLE::Stream *tableStream)
+	PieceTable::PieceTable (FileInformationBlock *fib, POLE::Stream *tableStream, POLE::Stream* wordStream)
 	{
+		if (fib->m_FibWord97.lcbClx < 1/* || !fib->m_FibBase.fComplex*/) return;
+
 		// Read the bytes of complex file information
 		unsigned char* clx	=	new unsigned char[fib->m_FibWord97.lcbClx];

-		tableStream->seek(fib->m_FibWord97.fcClx);
-		tableStream->read(clx, (int)fib->m_FibWord97.lcbClx);
+		if (tableStream)
+		{
+			tableStream->seek(fib->m_FibWord97.fcClx);
+			tableStream->read(clx, (int)fib->m_FibWord97.lcbClx);
+		}

-		Pieces = list<PieceDescriptor>();
-		FileCharacterPositions = new map<int, int>();
-		CharacterPositions = new map<int, int>();
+		Pieces					= std::list<PieceDescriptor>();
+		FileCharacterPositions	= new std::map<int, int>();
+		CharacterPositions		= new std::map<int, int>();

 		int pos = 0;
 		bool goon = true;
@ -87,7 +92,7 @@ namespace DocFileFormat

 						memcpy(pcdBytes, (piecetable + indexPcd), 8);

-						PieceDescriptor pcd(pcdBytes, 8);
+						PieceDescriptor pcd(pcdBytes, 8, fib->m_CodePage);

 						pcd.cpStart	=	cp;
 						pcd.cpEnd	=	cpNext;
@ -99,7 +104,7 @@ namespace DocFileFormat
 						int f = (int)pcd.fc;
 						int multi = 1;

-						if ( pcd.encoding == ENCODING_UNICODE )
+						if ( pcd.code_page == ENCODING_UTF16 )
 						{
 							multi = 2;
 						}
@ -144,7 +149,6 @@ namespace DocFileFormat
 		}

 		RELEASEARRAYOBJECTS(clx);
-
 		m_carriageIter = Pieces.begin();
 	}

@ -161,14 +165,14 @@ namespace DocFileFormat
 	{
 		std::vector<wchar_t> *piecePairs = new std::vector<wchar_t>();

-		for ( list<PieceDescriptor>::iterator iter = this->Pieces.begin(); iter != this->Pieces.end(); ++iter)
+		for ( std::list<PieceDescriptor>::iterator iter = this->Pieces.begin(); iter != this->Pieces.end(); ++iter)
 		{
 			//get the FC end of this piece
 			PieceDescriptor pcd = *iter;

 			int pcdFcEnd = pcd.cpEnd - pcd.cpStart;

-			if (pcd.encoding == ENCODING_UNICODE)
+			if (pcd.code_page == ENCODING_UTF16)
 			{
 				pcdFcEnd *= 2;
 			}
@ -184,7 +188,7 @@ namespace DocFileFormat
 			stream->seek(pcd.fc);
 			stream->read(bytes, cb);

-			FormatUtils::GetSTLCollectionFromBytes<std::vector<wchar_t> >(piecePairs, bytes, cb, pcd.encoding);
+			FormatUtils::GetSTLCollectionFromBytes<std::vector<wchar_t> >(piecePairs, bytes, cb, pcd.code_page);

 			RELEASEARRAYOBJECTS(bytes);
 		}
@ -196,14 +200,14 @@ namespace DocFileFormat
 	{
 		std::vector<wchar_t> *encodingChars = new std::vector<wchar_t>();

-		for (list<PieceDescriptor>::iterator iter = Pieces.begin(); iter != Pieces.end(); ++iter)
+		for (std::list<PieceDescriptor>::iterator iter = Pieces.begin(); iter != Pieces.end(); ++iter)
 		{
 			PieceDescriptor pcd = *iter;

 			//get the FC end of this piece
 			int pcdFcEnd = pcd.cpEnd - pcd.cpStart;

-			if ( pcd.encoding == ENCODING_UNICODE )
+			if ( pcd.code_page == ENCODING_UTF16 )
 			{
 				pcdFcEnd *= 2;
 			}
@ -230,7 +234,7 @@ namespace DocFileFormat
 				wordStream->read( bytes, cb);

 				//get the chars
-				FormatUtils::GetSTLCollectionFromBytes<vector<wchar_t>>( encodingChars, bytes, cb, pcd.encoding );
+				FormatUtils::GetSTLCollectionFromBytes<std::vector<wchar_t>>( encodingChars, bytes, cb, pcd.code_page );

 				RELEASEARRAYOBJECTS( bytes );
 			}
@ -249,7 +253,7 @@ namespace DocFileFormat
 				wordStream->read( bytes, cb);

 				//get the chars
-				FormatUtils::GetSTLCollectionFromBytes<std::vector<wchar_t>>( encodingChars, bytes, cb, pcd.encoding );
+				FormatUtils::GetSTLCollectionFromBytes<std::vector<wchar_t>>( encodingChars, bytes, cb, pcd.code_page );

 				RELEASEARRAYOBJECTS( bytes );
 			}
@ -268,7 +272,7 @@ namespace DocFileFormat
 				wordStream->read( bytes, cb);

 				//get the chars
-				FormatUtils::GetSTLCollectionFromBytes<std::vector<wchar_t>>(encodingChars, bytes, cb, pcd.encoding);
+				FormatUtils::GetSTLCollectionFromBytes<std::vector<wchar_t>>(encodingChars, bytes, cb, pcd.code_page);

 				RELEASEARRAYOBJECTS(bytes);

@ -291,7 +295,7 @@ namespace DocFileFormat
 				wordStream->read( bytes, cb );

 				//get the chars
-				FormatUtils::GetSTLCollectionFromBytes<vector<wchar_t>>( encodingChars, bytes, cb, pcd.encoding );
+				FormatUtils::GetSTLCollectionFromBytes<std::vector<wchar_t>>( encodingChars, bytes, cb, pcd.code_page );

 				RELEASEARRAYOBJECTS( bytes );

@ -318,21 +322,11 @@ namespace DocFileFormat
 	{
 		std::vector<wchar_t>* encodingChars = new std::vector<wchar_t>();

-		//if (fcStart >= fcEnd)
-		//	return encodingChars;
-
-#ifdef _DEBUG
-		//if (fcStart == 3296 && fcEnd == 3326)
-		//{
-		//	int ccc = 0;
-		//}
-#endif
-
 		int fcSize = fcEnd - fcStart;

 		bool read = true;

-		for (list<PieceDescriptor>::iterator iter = Pieces.begin(); iter != Pieces.end(); ++iter)
+		for (std::list<PieceDescriptor>::iterator iter = Pieces.begin(); iter != Pieces.end(); ++iter)
 		{
 			PieceDescriptor pcd = (*iter);

@ -341,7 +335,7 @@ namespace DocFileFormat

 			int pcdFcEnd = pcd.cpEnd - pcd.cpStart;

-			if (pcd.encoding == ENCODING_UNICODE)
+			if (pcd.code_page == ENCODING_UTF16)
 			{
 				pcdFcEnd *= 2;
 			}
@ -362,7 +356,7 @@ namespace DocFileFormat
 				if (cb < 0)	
 					break;

-				if (!ReadSymbolsBuffer((int)fcStart, cb, pcd.encoding, word, encodingChars))
+				if (!ReadSymbolsBuffer((int)fcStart, cb, pcd.code_page, word, encodingChars))
 					break;

 				fcSize	-=	cb;
@ -382,7 +376,7 @@ namespace DocFileFormat
 				if (cb < 0)
 					break;

-				if (!ReadSymbolsBuffer((int)pcd.fc, cb, pcd.encoding, word, encodingChars))
+				if (!ReadSymbolsBuffer((int)pcd.fc, cb, pcd.code_page, word, encodingChars))
 					break;

 				fcSize	-=	cb;
@ -402,7 +396,7 @@ namespace DocFileFormat
 				if (cb <= 0)
 					break;				

-				if (!ReadSymbolsBuffer((int)pcd.fc, cb, pcd.encoding, word, encodingChars))
+				if (!ReadSymbolsBuffer((int)pcd.fc, cb, pcd.code_page, word, encodingChars))
 					break;

 				if (read)
@ -423,7 +417,7 @@ namespace DocFileFormat
 				if (cb <= 0) 
 					break;

-				if (!ReadSymbolsBuffer((int)fcStart, cb, pcd.encoding, word, encodingChars))
+				if (!ReadSymbolsBuffer((int)fcStart, cb, pcd.code_page, word, encodingChars))
 					break;

 				if (read)
@ -436,11 +430,6 @@ namespace DocFileFormat
 			}
 			else if (fcEnd < (int)pcd.fc)		//	this piece is beyond the requested range
 			{	
-#ifdef _DEBUG
-				//ATLTRACE(_T("PieceTable::GetChars() - fcEnd < (int)pcd.fc\n"));
-
-#endif
-
 				// имеет место быть перескок по стриму, поэтому корректируем начальную позицию

 				//size_t count = encodingChars->size();
@ -461,11 +450,11 @@ namespace DocFileFormat
 				//	int length = pcdFcEnd - pcd.fc;
 				//	if (length > fcSize)
 				//	{
-				//		ReadSymbolsBuffer((int)pcd.fc, fcSize, pcd.encoding, word, encodingChars);
+				//		ReadSymbolsBuffer((int)pcd.fc, fcSize, pcd.code_page, word, encodingChars);
 				//		break;
 				//	}

-				//	ReadSymbolsBuffer((int)pcd.fc, length, pcd.encoding, word, encodingChars);
+				//	ReadSymbolsBuffer((int)pcd.fc, length, pcd.code_page, word, encodingChars);
 				//	fcSize	-=	length;

 				//	continue;
@ -478,7 +467,7 @@ namespace DocFileFormat
 		return encodingChars;
 	}

-	inline bool PieceTable::ReadSymbolsBuffer(int pos, int size, Encoding encoding, POLE::Stream* word, std::vector<wchar_t>* encodingChars)
+	inline bool PieceTable::ReadSymbolsBuffer(int pos, int size, int coding, POLE::Stream* word, std::vector<wchar_t>* encodingChars)
 	{
 		unsigned char* bytes = new unsigned char[size];
 		if (NULL == bytes)
@ -488,7 +477,7 @@ namespace DocFileFormat
 		word->read(bytes, size);


-        FormatUtils::GetSTLCollectionFromBytes<std::vector<wchar_t>>(encodingChars, bytes, size, encoding);
+        FormatUtils::GetSTLCollectionFromBytes<std::vector<wchar_t>>(encodingChars, bytes, size, coding);

        RELEASEARRAYOBJECTS(bytes);