core/ASCOfficePDFReader/CharCodeToUnicode.cpp

#include "stdafx.h"

#include <stdio.h>
#include <string.h>
#include "MemoryUtils.h"
#include "File.h"
#include "StringExt.h"
#include "GlobalParams.h"
#include "PSLexer.h"
#include "CharCodeToUnicode.h"

//-------------------------------------------------------------------------------------------------------------------------------

// Capacity of CharCodeToUnicodeString::pUnicodeString, i.e. the largest number of
// Unicode values that can be stored for a single character code.
#define MaxUnicodeString 8

// One entry of the "string map": a character code that maps to a sequence of
// Unicode values rather than to a single one.
struct CharCodeToUnicodeString
{
	// Character code this entry belongs to.
	CharCode nCode;
	// Unicode values the code maps to; only the first nLen slots are filled in.
	Unicode  pUnicodeString[MaxUnicodeString];
	// Number of Unicode values recorded for nCode.
	int      nLen;
};

//-------------------------------------------------------------------------------------------------------------------------------

// Character source that reads from an in-memory, NUL-terminated string
// (handed to PSLexer by ParseCMap / MergeCMap).
//
// pData - address of a `char *` cursor into the string; the cursor is advanced
//         by one for every character returned.
// Returns the next character as a value in the range 1..255, or EOF once the
// terminating NUL is reached (the cursor then stays on the terminator).
static int GetCharFromString(void *pData)
{
	char **ppCursor = (char **)pData;
	char *sString = *ppCursor;

	if ( '\0' == *sString )
	{
		return EOF;
	}

	// Convert through unsigned char, exactly as fgetc() does. With a signed `char`
	// a byte >= 0x80 would otherwise be returned as a negative number, and 0xFF
	// would be indistinguishable from EOF.
	int nChar = (unsigned char)*sString;
	*ppCursor = sString + 1;
	return nChar;
}

// Character source that reads from an open stdio stream.
//
// pData - the FILE * to read from, passed as an opaque pointer.
// Returns whatever fgetc() yields for that stream: the next byte, or EOF.
static int GetCharFromFile(void *pData)
{
	FILE *pStream = static_cast<FILE *>(pData);
	const int nChar = fgetc(pStream);
	return nChar;
}

//-------------------------------------------------------------------------------------------------------------------------------

// Builds a mapping table from a "cidToUnicode" text file.
//
// The file is read line by line; the hexadecimal number at the start of the
// i-th line becomes the value stored at index i of the table. A line that does
// not start with a hexadecimal number still occupies its index, with value 0.
//
// seFileName   - path of the file to read.
// seCollection - a copy of this string becomes the tag of the returned object.
// Returns a newly allocated object, or NULL when the file cannot be opened.
CharCodeToUnicode *CharCodeToUnicode::ParseCIDToUnicode(StringExt *seFileName, StringExt *seCollection)
{
	FILE *pInput = fopen( seFileName->GetBuffer(), "r" );
	if ( NULL == pInput )
	{
		// TO DO: Error "Couldn't open cidToUnicode file"
		return NULL;
	}

	unsigned int unCapacity = 32768;
	unsigned int unCount    = 0;
	Unicode *pTable = (Unicode *)MemUtilsMallocArray( unCapacity, sizeof(Unicode) );

	char sLine[64];
	while ( GetLine( sLine, sizeof(sLine), pInput ) )
	{
		// Double the table whenever it is full.
		if ( unCount == unCapacity )
		{
			unCapacity *= 2;
			pTable = (Unicode *)MemUtilsReallocArray( pTable, unCapacity, sizeof(Unicode) );
		}

		Unicode unValue = 0;
		const bool bParsed = ( 1 == sscanf( sLine, "%x", &unValue ) );
		// TO DO: Error "Bad line in cidToUnicode file" (when !bParsed)
		pTable[unCount] = bParsed ? unValue : 0;
		++unCount;
	}
	fclose( pInput );

	// The constructor takes its own copy of the table (bCopyMap == TRUE),
	// so the scratch table is released here.
	CharCodeToUnicode *pResult = new CharCodeToUnicode( seCollection->Copy(), pTable, unCount, TRUE, NULL, 0, 0 );
	MemUtilsFree( pTable );
	return pResult;
}

// Reads a "unicodeToUnicode" remapping file and builds the corresponding table.
//
// Every usable line consists of hexadecimal numbers separated by spaces or tabs:
// a key followed by its replacement values. At most MaxUnicodeString values are
// read per line, and parsing of a line stops at the first token that is not
// hexadecimal (the values read up to that point are kept).
//   - a key with exactly one value is stored directly in the dense map;
//   - a key with several values gets 0 in the dense map plus an entry in the
//     string map.
// Lines without a parsable key, without any value, or with a key beyond
// U+10FFFF are skipped.
//
// seFileName - path of the file; a copy of it becomes the tag of the result.
// Returns a newly allocated object, or NULL when the file cannot be opened.
CharCodeToUnicode *CharCodeToUnicode::ParseUnicodeToUnicode(StringExt *seFileName)
{
	// Keys are Unicode values and the Unicode codespace ends at U+10FFFF.
	// Bounding the key also bounds the dense map: without the check a key
	// >= 0x80000000 made the doubling loop below wrap nSize to 0 and spin
	// forever, and nUnicode0 + 1 could wrap to 0 as well.
	const Unicode c_nMaxKey = 0x10FFFF;

	FILE *pFile = NULL;

	char sBuffer[256];
	char *sToken;
	Unicode nUnicode0;
	Unicode arrUnicodeBuffer[MaxUnicodeString];

	if ( !( pFile = fopen(seFileName->GetBuffer(), "r") ) )
	{
		// TO DO: Error "Couldn't open unicodeToUnicode file"
		return NULL;
	}

	// Dense map, indexed by key; zero-filled so that unmapped keys read as 0.
	unsigned int nSize = 4096;
	Unicode *pMap = (Unicode *)MemUtilsMallocArray( nSize, sizeof(Unicode));
	memset( pMap, 0, nSize * sizeof(Unicode));
	unsigned int nLen = 0;
	// String map for keys with more than one value; grown 16 entries at a time.
	CharCodeToUnicodeString *pSMap = NULL;
	unsigned int nSMapSize = 0, nSMapLen = 0;

	// Line counter; not read yet, kept for the pending error reporting.
	int nLine = 0;
	while ( GetLine( sBuffer, sizeof(sBuffer), pFile ) )
	{
		++nLine;
		if ( !( sToken = strtok(sBuffer, " \t\r\n") ) || sscanf( sToken, "%x", &nUnicode0) != 1 )
		{
			// TO DO: Error "Bad line in unicodeToUnicode file"
			continue;
		}
		if ( nUnicode0 > c_nMaxKey )
		{
			// TO DO: Error "Bad line in unicodeToUnicode file"
			continue;
		}
		int nCount = 0;
		while ( nCount < MaxUnicodeString )
		{
			if ( !( sToken = strtok(NULL, " \t\r\n") ) )
			{
				break;
			}
			if ( sscanf( sToken, "%x", &arrUnicodeBuffer[nCount]) != 1 )
			{
				// TO DO: Error "Bad line in unicodeToUnicode file"
				break;
			}
			++nCount;
		}
		if ( nCount < 1 )
		{
			// TO DO: Error "Bad line in unicodeToUnicode file"
			continue;
		}
		if ( nUnicode0 >= nSize )
		{
			// Double the dense map until the key fits; the key bound above keeps
			// nSize far away from unsigned wrap-around. New slots are zeroed.
			unsigned int nOldSize = nSize;
			while ( nUnicode0 >= nSize )
			{
				nSize *= 2;
			}
			pMap = (Unicode *)MemUtilsReallocArray( pMap, nSize, sizeof(Unicode));
			memset( pMap + nOldSize, 0, (nSize - nOldSize) * sizeof(Unicode));
		}
		if ( nCount == 1 )
		{
			pMap[nUnicode0] = arrUnicodeBuffer[0];
		}
		else
		{
			// 0 in the dense map sends lookups on to the string map.
			pMap[nUnicode0] = 0;
			if ( nSMapLen == nSMapSize )
			{
				nSMapSize += 16;
				pSMap = (CharCodeToUnicodeString *) MemUtilsReallocArray( pSMap, nSMapSize, sizeof(CharCodeToUnicodeString));
			}
			pSMap[nSMapLen].nCode = nUnicode0;
			for ( int nIndex = 0; nIndex < nCount; ++nIndex )
			{
				pSMap[nSMapLen].pUnicodeString[nIndex] = arrUnicodeBuffer[nIndex];
			}
			pSMap[nSMapLen].nLen = nCount;
			++nSMapLen;
		}
		// nLen tracks "highest key seen + 1", i.e. the used length of the dense map.
		if ( nUnicode0 >= nLen )
		{
			nLen = nUnicode0 + 1;
		}
	}
	fclose(pFile);

	// The dense map is copied by the constructor (bCopyMap == TRUE) and freed here;
	// the string map pointer is handed over to the new object as is.
	CharCodeToUnicode *pCharCodeToUnicode = new CharCodeToUnicode( seFileName->Copy(), pMap, nLen, TRUE, pSMap, nSMapLen, nSMapSize);
	MemUtilsFree(pMap);
	return pCharCodeToUnicode;
}

// Wraps a 256-entry table in a new CharCodeToUnicode object.
//
// pToUnicode - table whose first 256 entries are copied into the new object
//              (the constructor is called with bCopyMap == TRUE).
// Returns the new object; it has no tag and an empty string map.
CharCodeToUnicode *CharCodeToUnicode::Make8BitToUnicode(Unicode *pToUnicode)
{
	CharCodeToUnicode *pResult = new CharCodeToUnicode( NULL, pToUnicode, 256, TRUE, NULL, 0, 0 );
	return pResult;
}

// Creates a new, untagged CharCodeToUnicode and fills it from CMap text held in memory.
//
// seBuffer      - the CMap text; its buffer is read as a NUL-terminated string.
// nBitCount     - width of the character codes in bits, forwarded to ParseCMap1.
// pGlobalParams - forwarded to ParseCMap1.
// Returns the new object.
CharCodeToUnicode *CharCodeToUnicode::ParseCMap(StringExt *seBuffer, int nBitCount, GlobalParams *pGlobalParams)
{
	// GetCharFromString advances this cursor as the lexer consumes characters.
	char *pCursor = seBuffer->GetBuffer();

	CharCodeToUnicode *pResult = new CharCodeToUnicode(NULL);
	pResult->ParseCMap1( &GetCharFromString, &pCursor, nBitCount, pGlobalParams );
	return pResult;
}

// Parses CMap text held in memory and adds its mappings to this object.
//
// seBuffer      - the CMap text; its buffer is read as a NUL-terminated string.
// nBitCount     - width of the character codes in bits, forwarded to ParseCMap1.
// pGlobalParams - forwarded to ParseCMap1.
void CharCodeToUnicode::MergeCMap(StringExt *seBuffer, int nBitCount, GlobalParams *pGlobalParams)
{
	// GetCharFromString advances this cursor as the lexer consumes characters.
	char *pCursor = seBuffer->GetBuffer();
	void *pLexerData = &pCursor;
	this->ParseCMap1( &GetCharFromString, pLexerData, nBitCount, pGlobalParams );
}

// Tokenizes a ToUnicode CMap with PSLexer and records the mappings it finds in this object.
//
// Three operators are recognised; every other token is just remembered as the
// possible operand of the token that follows it:
//   /Name usecmap               - the file found through pGlobalParams->FindToUnicodeFile
//                                 is parsed recursively into this same object;
//   beginbfchar ... endbfchar   - pairs "<code> <unicode hex>";
//   beginbfrange ... endbfrange - triples "<first> <last> <start>" or
//                                 "<first> <last> [ <dst> <dst> ... ]".
//
// GetCharFunc   - callback that returns the next input character (GetCharFromString
//                 or GetCharFromFile in this file).
// pData         - opaque argument handed to GetCharFunc.
// nBitCount     - width of a character code in bits; source codes must be written
//                 with exactly nBitCount / 4 hex digits.
// pGlobalParams - used to resolve "usecmap" references; when NULL such references
//                 are skipped.
void CharCodeToUnicode::ParseCMap1(int (*GetCharFunc)(void *), void *pData, int nBitCount, GlobalParams *pGlobalParams)
{
	char sToken1[256], sToken2[256], sToken3[256];
	int nLen1, nLen2, nLen3;
	CharCode nCode1, nCode2;


	// One hex digit covers 4 bits.
	int nDigitCount = nBitCount / 4;
	PSLexer *pLexer = new PSLexer( GetCharFunc, pData);
	// Main-loop invariant: sToken1 holds the token that preceded sToken2.
	pLexer->GetToken( sToken1, sizeof(sToken1), &nLen1);

	while ( pLexer->GetToken( sToken2, sizeof(sToken2), &nLen2) )
	{
		if ( !strcmp( sToken2, "usecmap" ) )
		{
			// The operand is the preceding token and is only used if it starts with '/'.
			if ( sToken1[0] == '/' )
			{
				StringExt *seName = new StringExt(sToken1 + 1);
				FILE *pFile = NULL;
				if ( pGlobalParams && ( pFile = pGlobalParams->FindToUnicodeFile(seName) ) )
				{
					ParseCMap1( &GetCharFromFile, pFile, nBitCount, pGlobalParams );
					fclose(pFile);
				}
				else
				{
					// TO DO: Error "Couldn't find ToUnicode CMap file"
				}
				delete seName;
			}
			// Re-establish the invariant with the next token.
			pLexer->GetToken( sToken1, sizeof(sToken1), &nLen1);
		}
		else if ( !strcmp( sToken2, "beginbfchar") )
		{
			// Inside the block sToken1 is the source code and sToken2 its destination.
			while ( pLexer->GetToken( sToken1, sizeof(sToken1), &nLen1) )
			{
				if ( !strcmp( sToken1, "endbfchar") )
				{
					break;
				}
				if ( !pLexer->GetToken( sToken2, sizeof(sToken2), &nLen2 ) || !strcmp( sToken2, "endbfchar") )
				{
					// TO DO: Error "Illegal entry in bfchar block in ToUnicode CMap"
					break;
				}
				// The source must be "<" + nDigitCount characters + ">"; the destination
				// only needs the angle brackets.
				if ( !( nLen1 == 2 + nDigitCount && sToken1[0] == '<' && sToken1[nLen1 - 1] == '>' && sToken2[0] == '<' && sToken2[nLen2 - 1] == '>' ) )
				{
					// TO DO: Error "Illegal entry in bfchar block in ToUnicode CMap"
					continue;
				}
				// Cut off the closing '>' so that only the hex digits remain after the '<'.
				sToken1[nLen1 - 1] = sToken2[nLen2 - 1] = '\0';
				if ( sscanf( sToken1 + 1, "%x", &nCode1) != 1 )
				{
					// TO DO: Error "Illegal entry in bfchar block in ToUnicode CMap"
					continue;
				}
				AddMapping( nCode1, sToken2 + 1, nLen2 - 2, 0);
			}
			// Re-establish the invariant with the next token.
			pLexer->GetToken( sToken1, sizeof(sToken1), &nLen1);
		}
		else if ( !strcmp( sToken2, "beginbfrange") )
		{
			// Inside the block: sToken1 = first code, sToken2 = last code, sToken3 = destination(s).
			while ( pLexer->GetToken( sToken1, sizeof(sToken1), &nLen1) )
			{
				if ( !strcmp( sToken1, "endbfrange") )
				{
					break;
				}
				if ( !pLexer->GetToken( sToken2, sizeof(sToken2), &nLen2) || !strcmp( sToken2, "endbfrange") || !pLexer->GetToken( sToken3, sizeof(sToken3), &nLen3) || !strcmp( sToken3, "endbfrange") )
				{
					// TO DO: Error "Illegal entry in bfrange block in ToUnicode CMap"
					break;
				}
				// Both range bounds must be "<" + nDigitCount characters + ">".
				if ( !( nLen1 == 2 + nDigitCount && sToken1[0] == '<' && sToken1[nLen1 - 1] == '>' && nLen2 == 2 + nDigitCount && sToken2[0] == '<' && sToken2[nLen2 - 1] == '>' ) )
				{
					// TO DO: Error "Illegal entry in bfrange block in ToUnicode CMap"
					continue;
				}
				// Cut off the closing '>' so that only the hex digits remain after the '<'.
				sToken1[nLen1 - 1] = sToken2[nLen2 - 1] = '\0';
				if ( sscanf( sToken1 + 1, "%x", &nCode1) != 1 || sscanf( sToken2 + 1, "%x", &nCode2) != 1 )
				{
					// TO DO: Error "Illegal entry in bfrange block in ToUnicode CMap"
					continue;
				}
				if ( !strcmp( sToken3, "[") )
				{
					// Array form: the nIndex-th element is the destination of code nCode1 + nIndex.
					// sToken1 is reused for the elements; reading stops at "]", at the end of
					// input, or once the index has moved past nCode2.
					int nIndex = 0;
					while ( pLexer->GetToken( sToken1, sizeof(sToken1), &nLen1) && nCode1 + nIndex <= nCode2 )
					{
						if ( !strcmp(sToken1, "]") )
						{
							break;
						}
						if ( sToken1[0] == '<' && sToken1[nLen1 - 1] == '>' )
						{
							sToken1[nLen1 - 1] = '\0';
							AddMapping( nCode1 + nIndex, sToken1 + 1, nLen1 - 2, 0);
						}
						else
						{
							// TO DO: Error "Illegal entry in bfrange block in ToUnicode CMap"
						}
						++nIndex;
					}
				}
				else if ( sToken3[0] == '<' && sToken3[nLen3 - 1] == '>' )
				{
					// Single start value: every code in the range gets the same hex string,
					// with nIndex passed to AddMapping as the offset to add.
					sToken3[nLen3 - 1] = '\0';
					for ( int nIndex = 0; nCode1 <= nCode2; ++nCode1, ++nIndex )
					{
						AddMapping( nCode1, sToken3 + 1, nLen3 - 2, nIndex);
					}
				}
				else
				{
					// TO DO: Error "Illegal entry in bfrange block in ToUnicode CMap"
				}
			}
			// Re-establish the invariant with the next token.
			pLexer->GetToken( sToken1, sizeof(sToken1), &nLen1);
		}
		else
		{
			// Not an operator handled here: keep it as the "previous token".
			strcpy( sToken1, sToken2);
		}
	}
	delete pLexer;
}

// Records the mapping of one character code, given the destination as hex text.
//
// nCode          - character code being mapped; the dense map grows as needed.
//                  Codes above 0xFFFFFF are ignored.
// sUnicodeString - NUL-terminated hex digits of the destination (no angle brackets).
// nLen           - number of characters in sUnicodeString.
// nOffset        - value added to the (last) Unicode value of the destination.
//
// A destination of up to 4 characters is parsed as one number and stored in the
// dense map. A longer one is split into groups of 4 hex digits and stored in the
// string map; at most MaxUnicodeString groups are kept.
void CharCodeToUnicode::AddMapping(CharCode nCode, char *sUnicodeString, int nLen, int nOffset)
{
	// Arbitrary upper bound on the code. Without it, (nCode + 256) & ~255 below
	// wraps around for codes close to the maximum, the map ends up shorter than
	// nCode and the store into m_pMap[nCode] lands outside the allocation.
	if ( nCode > 0xFFFFFF )
	{
		return;
	}

	if ( nCode >= m_nMapLen )
	{
		// Grow to the next multiple of 256 above nCode and zero the new tail.
		unsigned int unOldLen = m_nMapLen;
		m_nMapLen = (nCode + 256) & ~255;
		m_pMap = (Unicode *)MemUtilsReallocArray( m_pMap, m_nMapLen, sizeof(Unicode));
		for ( unsigned int unIndex = unOldLen; unIndex < m_nMapLen; ++unIndex )
		{
			m_pMap[unIndex] = 0;
		}
	}
	if ( nLen <= 4 )
	{
		Unicode nUnicode = 0;
		if ( sscanf( sUnicodeString, "%x", &nUnicode) != 1)
		{
			// TO DO: Error Illegal entry in ToUnicode CMap"
			return;
		}
		m_pMap[nCode] = nUnicode + nOffset;
	}
	else
	{
		if ( m_nSMapLen >= m_nSMapSize )
		{
			m_nSMapSize = m_nSMapSize + 16;
			m_pSMap = (CharCodeToUnicodeString *) MemUtilsReallocArray( m_pSMap, m_nSMapSize, sizeof(CharCodeToUnicodeString));
		}

		// nLen >= 5 here, so there is at least one group. The count is clamped to
		// the capacity of pUnicodeString: storing the raw nLen / 4 let the
		// "+= nOffset" below, and later readers of nLen, index past the array.
		int nCount = nLen / 4;
		if ( nCount > MaxUnicodeString )
		{
			nCount = MaxUnicodeString;
		}

		CharCodeToUnicodeString *pEntry = &m_pSMap[m_nSMapLen];
		// 0 in the dense map sends lookups on to the string map.
		m_pMap[nCode] = 0;
		pEntry->nCode = nCode;
		pEntry->nLen  = nCount;
		for ( int nIndex = 0; nIndex < nCount; ++nIndex )
		{
			char pUnicodeHex[5];
			strncpy( pUnicodeHex, sUnicodeString + nIndex * 4, 4);
			pUnicodeHex[4] = '\0';
			// Defined value for the slot even when the group is not valid hex.
			pEntry->pUnicodeString[nIndex] = 0;
			if ( sscanf( pUnicodeHex, "%x", &pEntry->pUnicodeString[nIndex] ) != 1 )
			{
				// TO DO: Error "Illegal entry in ToUnicode CMap"
			}
		}
		pEntry->pUnicodeString[nCount - 1] += nOffset;
		++m_nSMapLen;
	}
}

// Creates an empty mapping: a zero-filled dense map of 256 entries, no string map
// entries, and a reference count of 1.
//
// seTag - stored as the tag of the object (deleted by the destructor); may be NULL.
CharCodeToUnicode::CharCodeToUnicode(StringExt *seTag)
{
	m_nRef  = 1;
	m_seTag = seTag;

	// No multi-value entries yet.
	m_pSMap     = NULL;
	m_nSMapLen  = 0;
	m_nSMapSize = 0;

	// Dense map: every code starts out unmapped (0).
	m_nMapLen = 256;
	m_pMap    = (Unicode *)MemUtilsMallocArray( m_nMapLen, sizeof(Unicode) );
	Unicode *pSlot = m_pMap;
	Unicode *pEnd  = m_pMap + m_nMapLen;
	while ( pSlot != pEnd )
	{
		*pSlot = 0;
		++pSlot;
	}

	InitializeCriticalSection( &m_oCS );
}

// Creates a mapping from already prepared tables; the reference count starts at 1.
//
// seTag     - stored as the tag of the object (deleted by the destructor); may be NULL.
// pMap      - dense map with unMapLen entries.
// unMapLen  - number of entries in pMap.
// bCopyMap  - when set, the object allocates its own copy of pMap; otherwise it
//             stores pMap itself (and the destructor frees it).
// pSMap     - string map, stored as is (the destructor frees it); may be NULL.
// nSMapLen  - number of entries used in pSMap.
// nSMapSize - number of entries allocated for pSMap.
CharCodeToUnicode::CharCodeToUnicode(StringExt *seTag, Unicode *pMap, CharCode unMapLen, BOOL bCopyMap, CharCodeToUnicodeString *pSMap, int nSMapLen, int nSMapSize)
{
	m_nRef  = 1;
	m_seTag = seTag;

	m_nMapLen = unMapLen;
	m_pMap    = pMap;
	if ( bCopyMap )
	{
		// Replace the borrowed pointer with a private copy.
		m_pMap = (Unicode *)MemUtilsMallocArray( m_nMapLen, sizeof(Unicode) );
		memcpy( m_pMap, pMap, m_nMapLen * sizeof(Unicode) );
	}

	m_pSMap     = pSMap;
	m_nSMapSize = nSMapSize;
	m_nSMapLen  = nSMapLen;

	InitializeCriticalSection( &m_oCS );
}

// Releases the tag, both maps and the critical section.
CharCodeToUnicode::~CharCodeToUnicode()
{
	if ( NULL != m_seTag )
	{
		delete m_seTag;
	}

	MemUtilsFree( m_pMap );
	MemUtilsFree( m_pSMap );

	DeleteCriticalSection( &m_oCS );
}

// Increments the reference count while holding m_oCS.
void CharCodeToUnicode::AddRef()
{
	// The guard lives on the stack instead of being created with new and
	// destroyed with RELEASEOBJECT: no heap allocation per call, and it is
	// destroyed on every path out of the function.
	// NOTE(review): CTemporaryCS is presumably a scoped lock (enters m_oCS in its
	// constructor, leaves it in its destructor) - confirm against its declaration.
	CTemporaryCS oLock( &m_oCS );

	++m_nRef;
}

void CharCodeToUnicode::Release()
{
	CTemporaryCS *pCS = new CTemporaryCS( &m_oCS );

	BOOL bDelete = ( --m_nRef == 0 );

	RELEASEOBJECT( pCS );

	if ( bDelete )
	{
		delete this;
	}
}

// Tells whether this object carries a tag for which Compare(seTag) yields 0.
// An object without a tag never matches.
BOOL CharCodeToUnicode::Match(StringExt *seTag)
{
	const bool bSameTag = ( NULL != m_seTag ) && ( 0 == m_seTag->Compare(seTag) );
	return bSameTag;
}

// Sets or replaces the mapping of one character code.
//
// nCode    - character code to map; must lie inside the dense map
//            (nCode < m_nMapLen), otherwise the call is ignored.
// pUnicode - the Unicode values nCode maps to.
// nLen     - number of values in pUnicode. Exactly 1 stores the value in the dense
//            map; any other count stores a string map entry, of which at most
//            MaxUnicodeString values are kept.
void CharCodeToUnicode::SetMapping(CharCode nCode, Unicode *pUnicode, int nLen)
{
	// The dense map is not grown here, so a code beyond it cannot be stored:
	// writing m_pMap[nCode] would land outside the allocation, and MapToUnicode
	// rejects such codes anyway.
	if ( nCode >= m_nMapLen )
	{
		return;
	}

	if ( nLen == 1 )
	{
		m_pMap[nCode] = pUnicode[0];
		return;
	}

	// Never record more values than pUnicodeString can hold; an unclamped nLen
	// would make readers of the entry run past the array.
	if ( nLen > MaxUnicodeString )
	{
		nLen = MaxUnicodeString;
	}

	// Reuse the existing entry for this code, if there is one.
	int nIndex = 0;
	for ( nIndex = 0; nIndex < m_nSMapLen; ++nIndex )
	{
		if ( m_pSMap[nIndex].nCode == nCode )
		{
			break;
		}
	}
	if ( nIndex == m_nSMapLen )
	{
		// Not found: append a new entry, growing the string map 8 entries at a time.
		if ( m_nSMapLen == m_nSMapSize )
		{
			m_nSMapSize += 8;
			m_pSMap = (CharCodeToUnicodeString *) MemUtilsReallocArray(m_pSMap, m_nSMapSize, sizeof(CharCodeToUnicodeString));
		}
		++m_nSMapLen;
	}

	// 0 in the dense map sends lookups on to the string map.
	m_pMap[nCode] = 0;
	m_pSMap[nIndex].nCode = nCode;
	m_pSMap[nIndex].nLen  = nLen;

	for ( int nJ = 0; nJ < nLen; ++nJ )
	{
		m_pSMap[nIndex].pUnicodeString[nJ] = pUnicode[nJ];
	}
}

// Looks up the Unicode value(s) of a character code.
//
// nCode    - character code to look up.
// pUnicode - output buffer.
// size     - number of elements pUnicode can hold.
// Returns the number of values written to pUnicode; 0 when the code has no
// mapping, lies outside the dense map, or size is not positive.
int CharCodeToUnicode::MapToUnicode(CharCode nCode, Unicode *pUnicode, int size)
{
	// Without room for even one value nothing may be written to pUnicode.
	if ( size <= 0 || nCode >= m_nMapLen )
	{
		return 0;
	}

	// A non-zero dense map entry is a complete single-value mapping.
	if ( m_pMap[nCode] )
	{
		pUnicode[0] = m_pMap[nCode];
		return 1;
	}

	// Otherwise the code may have an entry in the string map.
	for ( int nIndex = 0; nIndex < m_nSMapLen; ++nIndex )
	{
		if ( m_pSMap[nIndex].nCode != nCode )
		{
			continue;
		}

		// Copy no more than the entry's array holds, whatever its recorded length
		// says, and no more than the caller's buffer takes.
		int nCount = m_pSMap[nIndex].nLen;
		if ( nCount > MaxUnicodeString )
		{
			nCount = MaxUnicodeString;
		}
		if ( nCount > size )
		{
			nCount = size;
		}

		int nJ = 0;
		for ( nJ = 0; nJ < nCount; ++nJ )
		{
			pUnicode[nJ] = m_pSMap[nIndex].pUnicodeString[nJ];
		}
		return nJ;
	}
	return 0;
}

//-------------------------------------------------------------------------------------------------------------------------------

// Creates a cache with nSize slots, all of them empty.
CharCodeToUnicodeCache::CharCodeToUnicodeCache(int nSize)
{
	m_nSize   = nSize;
	m_ppCache = (CharCodeToUnicode **)MemUtilsMallocArray( m_nSize, sizeof(CharCodeToUnicode *) );

	// An empty slot is marked by a NULL pointer.
	int nSlot = m_nSize;
	while ( nSlot-- > 0 )
	{
		m_ppCache[nSlot] = NULL;
	}
}

// Drops the cache's reference to every object it still holds, then frees the slot array.
CharCodeToUnicodeCache::~CharCodeToUnicodeCache()
{
	CharCodeToUnicode **ppEnd = m_ppCache + m_nSize;
	for ( CharCodeToUnicode **ppSlot = m_ppCache; ppSlot < ppEnd; ++ppSlot )
	{
		CharCodeToUnicode *pEntry = *ppSlot;
		if ( pEntry )
		{
			pEntry->Release();
		}
	}
	MemUtilsFree( m_ppCache );
}

// Looks for a cached object whose tag matches seTag.
//
// On a hit the object is moved to slot 0 (the entries that were in front of it
// shift back by one), its reference count is incremented, and it is returned.
// Returns NULL when no slot matches.
CharCodeToUnicode *CharCodeToUnicodeCache::GetCharCodeToUnicode(StringExt *seTag)
{
	for ( int nHit = 0; nHit < m_nSize; ++nHit )
	{
		CharCodeToUnicode *pEntry = m_ppCache[nHit];
		if ( !pEntry || !pEntry->Match(seTag) )
		{
			continue;
		}

		// Move to front; for a hit in slot 0 nothing shifts.
		for ( int nSlot = nHit; nSlot > 0; --nSlot )
		{
			m_ppCache[nSlot] = m_ppCache[nSlot - 1];
		}
		m_ppCache[0] = pEntry;

		pEntry->AddRef();
		return pEntry;
	}
	return NULL;
}

// Inserts an object at the front of the cache.
//
// The object in the last slot, if any, is released; all other entries shift
// back by one slot. The cache takes its own reference on the new object.
void CharCodeToUnicodeCache::Add(CharCodeToUnicode *pCharCodeToUnicode)
{
	const int nLast = m_nSize - 1;

	// Evict whatever occupies the last slot.
	CharCodeToUnicode *pEvicted = m_ppCache[nLast];
	if ( pEvicted )
	{
		pEvicted->Release();
	}

	int nSlot = nLast;
	while ( nSlot >= 1 )
	{
		m_ppCache[nSlot] = m_ppCache[nSlot - 1];
		--nSlot;
	}

	m_ppCache[0] = pCharCodeToUnicode;
	pCharCodeToUnicode->AddRef();
}