mirror of
https://github.com/ONLYOFFICE/core.git
synced 2026-04-07 13:55:33 +08:00
Fix hyphen bugs & add tests
This commit is contained in:
@ -1,110 +1,149 @@
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <cctype>
|
||||
|
||||
#include "./../js/src/ExportedFunctions.h"
|
||||
#include "../../../../DesktopEditor/common/Directory.h"
|
||||
#include "../../../../DesktopEditor/fontengine/TextHyphen.h"
|
||||
#include "../../../../DesktopEditor/common/StringUTF32.h"
|
||||
|
||||
#define USE_DICTIONARIES_FROM_MEMORY
|
||||
|
||||
std::vector<std::wstring> ReadWords(const std::wstring& file)
|
||||
{
|
||||
std::vector<std::wstring> words;
|
||||
|
||||
NSFile::CFileBinary oFile;
|
||||
if (!oFile.OpenFile(file))
|
||||
return words;
|
||||
|
||||
int nInputDataLen = (int)oFile.GetFileSize();
|
||||
char* pInputData = new char[nInputDataLen];
|
||||
|
||||
oFile.ReadFile((BYTE*)pInputData, (DWORD)nInputDataLen);
|
||||
|
||||
int nPos = 0;
|
||||
if (nInputDataLen > 3 &&
|
||||
((BYTE)pInputData[0] == 0xEF) && ((BYTE)pInputData[1] == 0xBB) && ((BYTE)pInputData[2]) == 0xBF)
|
||||
{
|
||||
nPos = 3;
|
||||
}
|
||||
|
||||
while (nPos < nInputDataLen)
|
||||
{
|
||||
while (nPos < nInputDataLen && (pInputData[nPos] == '\n' || pInputData[nPos] == '\r'))
|
||||
++nPos;
|
||||
|
||||
int nStart = nPos;
|
||||
|
||||
while (nPos < nInputDataLen && (pInputData[nPos] != '\n' && pInputData[nPos] != '\r'))
|
||||
++nPos;
|
||||
|
||||
if (nPos > nStart)
|
||||
{
|
||||
std::string s(pInputData + nStart, nPos - nStart);
|
||||
words.push_back(UTF8_TO_U(s));
|
||||
}
|
||||
}
|
||||
|
||||
oFile.CloseFile();
|
||||
RELEASEARRAYOBJECTS(pInputData);
|
||||
return words;
|
||||
}
|
||||
|
||||
std::wstring HyphenWord(NSHyphen::CEngine& engine, const int& lang, const std::wstring& word)
|
||||
{
|
||||
std::string worda = U_TO_UTF8(word);
|
||||
char* result = engine.Process(lang, worda.c_str(), (int)worda.length());
|
||||
|
||||
NSStringUtils::CStringUTF32 oInput = word;
|
||||
NSStringUtils::CStringUTF32 oOutput;
|
||||
|
||||
int nCurrentIndex = 0;
|
||||
int nPosHyphen = 0;
|
||||
while (result[nPosHyphen] != 0)
|
||||
{
|
||||
if (1 == (result[nPosHyphen] & 1))
|
||||
{
|
||||
int nLenChunk = nPosHyphen - nCurrentIndex + 1;
|
||||
oOutput += oInput.substr(nCurrentIndex, nLenChunk);
|
||||
oOutput += '=';
|
||||
nCurrentIndex += nLenChunk;
|
||||
}
|
||||
++nPosHyphen;
|
||||
}
|
||||
|
||||
if (nCurrentIndex < oInput.length())
|
||||
oOutput += oInput.substr(nCurrentIndex, oInput.length() - nCurrentIndex);
|
||||
|
||||
return oOutput.ToStdWString();
|
||||
}
|
||||
|
||||
int GetLanguage(const std::wstring& sLanguage)
|
||||
{
|
||||
std::string sLang = U_TO_UTF8(sLanguage);
|
||||
int nLang = 0;
|
||||
for (int j = 0; j < NSTextLanguages::DictionaryRec_count; ++j)
|
||||
{
|
||||
if (std::string(NSTextLanguages::Dictionaries[j].m_name) == sLang)
|
||||
{
|
||||
nLang = NSTextLanguages::Dictionaries[j].m_lang;
|
||||
break;
|
||||
}
|
||||
}
|
||||
return nLang;
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
HyphenDict *dict;
|
||||
|
||||
std::string dict_filename = PRO_DIR;
|
||||
std::string words_filename = PRO_DIR;
|
||||
std::string result_filename = PRO_DIR;
|
||||
std::string dict_name = "en_US";
|
||||
|
||||
// set your filenames here
|
||||
dict_filename += ("../../../../../dictionaries/" + dict_name + "/hyph_" + dict_name + ".dic");
|
||||
words_filename += "words.txt";
|
||||
result_filename += "result.txt";
|
||||
|
||||
// load the hyphenation dictionary
|
||||
dict = hnj_hyphen_load(dict_filename.c_str());
|
||||
|
||||
std::ifstream fin(words_filename);
|
||||
if(!fin.is_open())
|
||||
{
|
||||
std::cerr << "could not open " << words_filename << "!" << std::endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
std::ofstream fout(result_filename);
|
||||
if(!fout.is_open())
|
||||
{
|
||||
std::cerr << "could not open " << result_filename << "!" << std::endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
while(!fin.eof())
|
||||
{
|
||||
char **rep = NULL;
|
||||
int *pos = NULL;
|
||||
int *cut = NULL;
|
||||
|
||||
std::string word;
|
||||
|
||||
fin >> word;
|
||||
int n = word.size();
|
||||
char *hword = new char[n * 2];
|
||||
char *hyphens = new char[n + 5];
|
||||
|
||||
/**
|
||||
* @brief
|
||||
* input data:
|
||||
*
|
||||
* word: input word
|
||||
* word_size: byte length of the input word
|
||||
* hyphens: allocated character buffer (size = word_size + 5)
|
||||
* hyphenated_word: allocated character buffer (size ~ word_size * 2) or NULL
|
||||
* rep, pos, cut: pointers (point to the allocated and _zeroed_ buffers
|
||||
* (size=word_size) or with NULL value) or NULL
|
||||
*
|
||||
* output data:
|
||||
*
|
||||
* hyphens: hyphenation vector (hyphenation points signed with odd numbers).
|
||||
* hyphenated_word: hyphenated input word (hyphens signed with `=').
|
||||
* optional (NULL input).
|
||||
* rep: NULL (only standard hyph.), or replacements (hyphenation points
|
||||
* signed with `=' in replacements).
|
||||
* pos: NULL, or difference of the actual position and the beginning
|
||||
* positions of the change in input words.
|
||||
* cut: NULL, or counts of the removed characters of the original words
|
||||
* at hyphenation.
|
||||
*
|
||||
* Note: rep, pos, cut are complementary arrays to the hyphens, indexed with the
|
||||
* character positions of the input word.
|
||||
*/
|
||||
hnj_hyphen_hyphenate2(dict, word.c_str(), n, hyphens, hword, &rep, &pos, &cut);
|
||||
|
||||
fout << hword << ' ';
|
||||
|
||||
delete[] hword;
|
||||
delete[] hyphens;
|
||||
}
|
||||
fin.close();
|
||||
fout.close();
|
||||
|
||||
#if 1
|
||||
|
||||
CHyphenApplication* pApplication = hyphenCreateApplication();
|
||||
|
||||
FILE* fDictionary = fopen(dict_filename.c_str(), "rb");
|
||||
fseek(fDictionary, 0, SEEK_END);
|
||||
long lDictSize = ftell(fDictionary);
|
||||
fseek(fDictionary, 0, SEEK_SET); /* same as rewind(f); */
|
||||
|
||||
char* pDictData = (char*)malloc(lDictSize);
|
||||
fread(pDictData, (size_t)lDictSize, 1, fDictionary);
|
||||
fclose(fDictionary);
|
||||
|
||||
int nResult = hyphenLoadDictionary(pApplication, pDictData, (unsigned int)lDictSize, dict_name.c_str());
|
||||
|
||||
free(pDictData);
|
||||
|
||||
char* pHyphenVector = hyphenWord(pApplication, "expedition", dict_name.c_str());
|
||||
|
||||
hyphenDestroyApplication(pApplication);
|
||||
NSHyphen::CEngine engine;
|
||||
std::wstring dictionaries_dir = NSFile::GetProcessDirectory() + L"/../../../../../../dictionaries";
|
||||
engine.Init(dictionaries_dir);
|
||||
|
||||
#if 0
|
||||
std::wstring sOneWord = HyphenWord(engine, 1033, L"expedition");
|
||||
#endif
|
||||
|
||||
std::wstring input_dir = NSFile::GetProcessDirectory() + L"/input";
|
||||
std::wstring output_dir = NSFile::GetProcessDirectory() + L"/output";
|
||||
std::vector<std::wstring> langs = NSDirectory::GetFiles(input_dir);
|
||||
|
||||
for (std::wstring& lang : langs)
|
||||
{
|
||||
std::wstring sLang = NSFile::GetFileName(lang);
|
||||
int nLang = GetLanguage(sLang);
|
||||
if (nLang == 0)
|
||||
continue;
|
||||
|
||||
std::vector<std::wstring> words = ReadWords(lang);
|
||||
if (words.empty())
|
||||
continue;
|
||||
|
||||
#ifdef USE_DICTIONARIES_FROM_MEMORY
|
||||
BYTE* pData = NULL;
|
||||
DWORD dwDataLen = 0;
|
||||
std::wstring sFileDict = dictionaries_dir + L"/" + sLang + L"/hyph_" + sLang + L".dic";
|
||||
if (NSFile::CFileBinary::ReadAllBytes(sFileDict, &pData, dwDataLen))
|
||||
{
|
||||
engine.LoadDictionary(nLang, pData, (unsigned int)dwDataLen);
|
||||
RELEASEARRAYOBJECTS(pData);
|
||||
}
|
||||
#endif
|
||||
|
||||
std::wstring sResult;
|
||||
for (std::wstring& word : words)
|
||||
{
|
||||
sResult += HyphenWord(engine, nLang, word);
|
||||
sResult += L"\r\n";
|
||||
}
|
||||
|
||||
std::wstring sOutputFile = output_dir + L"/" + sLang;
|
||||
|
||||
if (NSFile::CFileBinary::Exists(sOutputFile))
|
||||
NSFile::CFileBinary::Remove(sOutputFile);
|
||||
|
||||
NSFile::CFileBinary::SaveToFile(sOutputFile, sResult, true);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -11,20 +11,7 @@ CORE_ROOT_DIR = $$PWD/../../../../../core
|
||||
PWD_ROOT_DIR = $$PWD
|
||||
|
||||
include($$CORE_ROOT_DIR/Common/base.pri)
|
||||
|
||||
INCLUDEPATH += $$PWD_ROOT_DIR/../hyphen
|
||||
|
||||
DEFINES += PRO_DIR=\\\"$$PWD/\\\"
|
||||
|
||||
HEADERS += $$PWD_ROOT_DIR/../hyphen/hyphen.h
|
||||
HEADERS += $$PWD_ROOT_DIR/../hyphen/hnjalloc.h
|
||||
|
||||
#SOURCES += $$PWD_ROOT_DIR/../hyphen/hyphen.c
|
||||
SOURCES += $$PWD_ROOT_DIR/../hyphen/hnjalloc.c
|
||||
|
||||
SOURCES += \
|
||||
../js/src/ExportedFunctions.cpp \
|
||||
../js/src/HyphenApplication.cpp
|
||||
ADD_DEPENDENCY(UnicodeConverter, kernel, graphics)
|
||||
|
||||
SOURCES += main.cpp
|
||||
|
||||
|
||||
Reference in New Issue
Block a user