Fix hyphen bugs & add tests

This commit is contained in:
Oleg Korshul
2023-10-12 22:46:23 +03:00
parent aca346e32b
commit 4bd9864129
6 changed files with 170 additions and 177 deletions

View File

@ -1,110 +1,149 @@
#include <fstream>
#include <iostream>
#include <vector>
#include <cctype>
#include "./../js/src/ExportedFunctions.h"
#include "../../../../DesktopEditor/common/Directory.h"
#include "../../../../DesktopEditor/fontengine/TextHyphen.h"
#include "../../../../DesktopEditor/common/StringUTF32.h"
#define USE_DICTIONARIES_FROM_MEMORY
std::vector<std::wstring> ReadWords(const std::wstring& file)
{
std::vector<std::wstring> words;
NSFile::CFileBinary oFile;
if (!oFile.OpenFile(file))
return words;
int nInputDataLen = (int)oFile.GetFileSize();
char* pInputData = new char[nInputDataLen];
oFile.ReadFile((BYTE*)pInputData, (DWORD)nInputDataLen);
int nPos = 0;
if (nInputDataLen > 3 &&
((BYTE)pInputData[0] == 0xEF) && ((BYTE)pInputData[1] == 0xBB) && ((BYTE)pInputData[2]) == 0xBF)
{
nPos = 3;
}
while (nPos < nInputDataLen)
{
while (nPos < nInputDataLen && (pInputData[nPos] == '\n' || pInputData[nPos] == '\r'))
++nPos;
int nStart = nPos;
while (nPos < nInputDataLen && (pInputData[nPos] != '\n' && pInputData[nPos] != '\r'))
++nPos;
if (nPos > nStart)
{
std::string s(pInputData + nStart, nPos - nStart);
words.push_back(UTF8_TO_U(s));
}
}
oFile.CloseFile();
RELEASEARRAYOBJECTS(pInputData);
return words;
}
/**
 * Returns a copy of a word with '=' inserted at every hyphenation point.
 *
 * The word is passed to the engine as UTF-8. The returned vector is walked up
 * to its terminating zero; an entry whose lowest bit is set means "a hyphen
 * may follow the character at this index". Indices are applied to the UTF-32
 * form of the word, i.e. the vector is treated as one entry per code point.
 *
 * @param engine  hyphenation engine used to process the word
 * @param lang    language id passed through to engine.Process
 * @param word    word to hyphenate
 * @return the word with '=' after each marked position; the word unchanged
 *         when the engine returns no hyphenation vector
 */
std::wstring HyphenWord(NSHyphen::CEngine& engine, const int& lang, const std::wstring& word)
{
	std::string worda = U_TO_UTF8(word);
	char* result = engine.Process(lang, worda.c_str(), (int)worda.length());

	// Without a vector there is nothing to mark; reading through a null
	// pointer here would crash the whole test run.
	// NOTE(review): presumably null means "no dictionary for this language" - confirm.
	if (nullptr == result)
		return word;

	NSStringUtils::CStringUTF32 oInput = word;
	NSStringUtils::CStringUTF32 oOutput;

	int nCurrentIndex = 0; // first character not yet copied to the output
	for (int nPosHyphen = 0; result[nPosHyphen] != 0; ++nPosHyphen)
	{
		if (1 != (result[nPosHyphen] & 1))
			continue;

		// Copy everything up to and including the marked character, then the mark.
		int nLenChunk = nPosHyphen - nCurrentIndex + 1;
		oOutput += oInput.substr(nCurrentIndex, nLenChunk);
		oOutput += '=';
		nCurrentIndex += nLenChunk;
	}

	// Tail after the last hyphenation point (the whole word if there was none).
	if (nCurrentIndex < oInput.length())
		oOutput += oInput.substr(nCurrentIndex, oInput.length() - nCurrentIndex);

	return oOutput.ToStdWString();
}
/**
 * Looks up the language id that belongs to a dictionary name.
 *
 * The name is converted with U_TO_UTF8 and compared against m_name of each
 * entry of NSTextLanguages::Dictionaries, in table order.
 *
 * @param sLanguage  dictionary name to look up
 * @return m_lang of the first matching entry, or 0 when no entry matches
 */
int GetLanguage(const std::wstring& sLanguage)
{
	const std::string sName = U_TO_UTF8(sLanguage);

	int nIndex = 0;
	while (nIndex < NSTextLanguages::DictionaryRec_count)
	{
		const auto& oRecord = NSTextLanguages::Dictionaries[nIndex];
		if (std::string(oRecord.m_name) == sName)
			return oRecord.m_lang;
		++nIndex;
	}

	// Name is not present in the table.
	return 0;
}
/**
 * Test driver for the hyphenation code.
 *
 * As written here it does three things in sequence:
 *  1. hyphenates the whitespace-separated words of PRO_DIR "words.txt" with
 *     hnj_hyphen_hyphenate2 and writes them, space-separated, to
 *     PRO_DIR "result.txt";
 *  2. reads the same dictionary file into memory, loads it into a
 *     CHyphenApplication and hyphenates the word "expedition";
 *  3. for every file found in "<process dir>/input" whose file name is a
 *     known dictionary name, hyphenates each line with NSHyphen::CEngine and
 *     writes the results to "<process dir>/output/<same file name>".
 *
 * NOTE(review): the `#if 1` further down has no matching `#endif` in the
 * visible text. Presumably this listing interleaves the removed and the added
 * lines of the commit - confirm against the real file before relying on it.
 *
 * @return 0 on completion, -1 when words.txt or result.txt cannot be opened
 */
int main(int argc, char *argv[])
{
HyphenDict *dict;
std::string dict_filename = PRO_DIR;
std::string words_filename = PRO_DIR;
std::string result_filename = PRO_DIR;
std::string dict_name = "en_US";
// set your filenames here
dict_filename += ("../../../../../dictionaries/" + dict_name + "/hyph_" + dict_name + ".dic");
words_filename += "words.txt";
result_filename += "result.txt";
// load the hyphenation dictionary
// NOTE(review): the returned pointer is used below without a null check, and
// no call that releases the dictionary is visible in this function.
dict = hnj_hyphen_load(dict_filename.c_str());
std::ifstream fin(words_filename);
if(!fin.is_open())
{
std::cerr << "could not open " << words_filename << "!" << std::endl;
return -1;
}
std::ofstream fout(result_filename);
if(!fout.is_open())
{
std::cerr << "could not open " << result_filename << "!" << std::endl;
return -1;
}
// NOTE(review): eof() is tested before the read, so when the final
// `fin >> word` fails the body still runs once with an empty word (n == 0).
while(!fin.eof())
{
// Passed by address as the optional rep/pos/cut outputs described below.
char **rep = NULL;
int *pos = NULL;
int *cut = NULL;
std::string word;
fin >> word;
int n = word.size();
// Buffer sizes follow the contract quoted below: word_size * 2 and word_size + 5.
char *hword = new char[n * 2];
char *hyphens = new char[n + 5];
/**
 * Contract of hnj_hyphen_hyphenate2 as relied on by the call below.
 *
 * input data:
 *
 * word: input word
 * word_size: byte length of the input word
 * hyphens: allocated character buffer (size = word_size + 5)
 * hyphenated_word: allocated character buffer (size ~ word_size * 2) or NULL
 * rep, pos, cut: pointers (point to the allocated and _zeroed_ buffers
 * (size=word_size) or with NULL value) or NULL
 *
 * output data:
 *
 * hyphens: hyphenation vector (hyphenation points signed with odd numbers).
 * hyphenated_word: hyphenated input word (hyphens signed with `=').
 * optional (NULL input).
 * rep: NULL (only standard hyph.), or replacements (hyphenation points
 * signed with `=' in replacements).
 * pos: NULL, or difference of the actual position and the beginning
 * positions of the change in input words.
 * cut: NULL, or counts of the removed characters of the original words
 * at hyphenation.
 *
 * Note: rep, pos, cut are complementary arrays to the hyphens, indexed with the
 * character positions of the input word.
 */
// NOTE(review): rep/pos/cut are not released after the call; if the library
// allocates them for this dictionary they would leak - TODO confirm.
hnj_hyphen_hyphenate2(dict, word.c_str(), n, hyphens, hword, &rep, &pos, &cut);
fout << hword << ' ';
delete[] hword;
delete[] hyphens;
}
fin.close();
fout.close();
// NOTE(review): no `#endif` for this `#if 1` is visible in this listing.
#if 1
// Load the same dictionary file from memory into a CHyphenApplication.
CHyphenApplication* pApplication = hyphenCreateApplication();
// NOTE(review): the fopen, malloc and fread results are not checked.
FILE* fDictionary = fopen(dict_filename.c_str(), "rb");
fseek(fDictionary, 0, SEEK_END);
long lDictSize = ftell(fDictionary);
fseek(fDictionary, 0, SEEK_SET); /* same as rewind(f); */
char* pDictData = (char*)malloc(lDictSize);
fread(pDictData, (size_t)lDictSize, 1, fDictionary);
fclose(fDictionary);
// nResult and pHyphenVector are not read again in the visible code.
int nResult = hyphenLoadDictionary(pApplication, pDictData, (unsigned int)lDictSize, dict_name.c_str());
free(pDictData);
char* pHyphenVector = hyphenWord(pApplication, "expedition", dict_name.c_str());
hyphenDestroyApplication(pApplication);
// File-driven run: one input file per language, named after its dictionary.
NSHyphen::CEngine engine;
std::wstring dictionaries_dir = NSFile::GetProcessDirectory() + L"/../../../../../../dictionaries";
engine.Init(dictionaries_dir);
// Disabled single-word check, kept for manual debugging.
#if 0
std::wstring sOneWord = HyphenWord(engine, 1033, L"expedition");
#endif
std::wstring input_dir = NSFile::GetProcessDirectory() + L"/input";
std::wstring output_dir = NSFile::GetProcessDirectory() + L"/output";
std::vector<std::wstring> langs = NSDirectory::GetFiles(input_dir);
for (std::wstring& lang : langs)
{
// The file name doubles as the dictionary name; names GetLanguage does not
// know (result 0) and files without words are skipped.
std::wstring sLang = NSFile::GetFileName(lang);
int nLang = GetLanguage(sLang);
if (nLang == 0)
continue;
std::vector<std::wstring> words = ReadWords(lang);
if (words.empty())
continue;
// With USE_DICTIONARIES_FROM_MEMORY defined, the dictionary file is read
// here and handed to the engine as a memory buffer.
#ifdef USE_DICTIONARIES_FROM_MEMORY
BYTE* pData = NULL;
DWORD dwDataLen = 0;
std::wstring sFileDict = dictionaries_dir + L"/" + sLang + L"/hyph_" + sLang + L".dic";
if (NSFile::CFileBinary::ReadAllBytes(sFileDict, &pData, dwDataLen))
{
engine.LoadDictionary(nLang, pData, (unsigned int)dwDataLen);
RELEASEARRAYOBJECTS(pData);
}
#endif
// One hyphenated word per line, each terminated with CRLF.
std::wstring sResult;
for (std::wstring& word : words)
{
sResult += HyphenWord(engine, nLang, word);
sResult += L"\r\n";
}
// Replace any previous result for this language.
std::wstring sOutputFile = output_dir + L"/" + sLang;
if (NSFile::CFileBinary::Exists(sOutputFile))
NSFile::CFileBinary::Remove(sOutputFile);
// NOTE(review): the meaning of the third argument (true) is not visible here - confirm.
NSFile::CFileBinary::SaveToFile(sOutputFile, sResult, true);
}
return 0;
}

View File

@ -11,20 +11,7 @@ CORE_ROOT_DIR = $$PWD/../../../../../core
# Directory of this project file, stored under its own name for the paths below.
PWD_ROOT_DIR = $$PWD
# Pull in Common/base.pri from the core checkout (presumably the shared build settings).
include($$CORE_ROOT_DIR/Common/base.pri)
# Make the headers in ../hyphen visible to the compiler.
INCLUDEPATH += $$PWD_ROOT_DIR/../hyphen
# PRO_DIR becomes a C++ string literal holding this directory with a trailing slash.
DEFINES += PRO_DIR=\\\"$$PWD/\\\"
HEADERS += $$PWD_ROOT_DIR/../hyphen/hyphen.h
HEADERS += $$PWD_ROOT_DIR/../hyphen/hnjalloc.h
# hyphen.c is deliberately left out of the build here; hnjalloc.c is still compiled.
# NOTE(review): confirm that whatever defines the hnj_hyphen_* functions is built elsewhere.
#SOURCES += $$PWD_ROOT_DIR/../hyphen/hyphen.c
SOURCES += $$PWD_ROOT_DIR/../hyphen/hnjalloc.c
# Sources from ../js/src (exported functions and HyphenApplication).
SOURCES += \
../js/src/ExportedFunctions.cpp \
../js/src/HyphenApplication.cpp
# ADD_DEPENDENCY is not defined in this fragment; presumably it comes from base.pri
# and links the listed core libraries - confirm.
ADD_DEPENDENCY(UnicodeConverter, kernel, graphics)
# The test driver itself.
SOURCES += main.cpp