From cce60166a413b3a7cac32f68c86918512becd461 Mon Sep 17 00:00:00 2001 From: Alexey Date: Tue, 6 Feb 2024 03:42:48 +0300 Subject: [PATCH] Add extract mode in x2ttester --- Test/Applications/x2tTester/README.md | 21 +- Test/Applications/x2tTester/x2tTester.cpp | 245 +++++++++++++++++++--- Test/Applications/x2tTester/x2tTester.h | 42 +++- 3 files changed, 271 insertions(+), 37 deletions(-) diff --git a/Test/Applications/x2tTester/README.md b/Test/Applications/x2tTester/README.md index f8a03581a4..1b35855423 100644 --- a/Test/Applications/x2tTester/README.md +++ b/Test/Applications/x2tTester/README.md @@ -1,7 +1,9 @@ CONFIGURATION ============= -You need to create an xml configuration file. It must contain: +## Default conversion + +You need to create an xml configuration file. It contains: # root of xml @@ -95,8 +97,23 @@ You need to create an xml configuration file. It must contain: docx txt pptx xlsx txt doc pdf +## Extraction +x2ttester can extract files with the required output extensions instead of performing the default x2t conversion. To enable extraction mode, set: -You can use the following templates: + (non-required) sets extraction mode (default - "0") + + +When `extract` is "1", you can set the `output` parameter to determine which extensions will be extracted. The default `output` is `emf wmf`. +The `input`, `inputDirectory`, `outputDirectory` and `cores` params work the same as in the default conversion. + +Extract mode has additional options: + (non-required) converts non-zip office files (e.g. pdf) into docx before extraction (default - "0"). + + +The conversion params used by `convertBeforeExtract` are the same as in the default conversion. 
+ +## Templates # main xml config diff --git a/Test/Applications/x2tTester/x2tTester.cpp b/Test/Applications/x2tTester/x2tTester.cpp index b594a8ae4b..cbbdc6a451 100644 --- a/Test/Applications/x2tTester/x2tTester.cpp +++ b/Test/Applications/x2tTester/x2tTester.cpp @@ -119,22 +119,23 @@ std::vector CFormatsList::GetAllExts() const { std::vector all_formats; - for(auto& val : m_documents) + for (const auto& val : m_documents) all_formats.push_back(val); - for(auto& val : m_presentations) + for (const auto& val : m_presentations) all_formats.push_back(val); - for(auto& val : m_spreadsheets) + for (const auto& val : m_spreadsheets) all_formats.push_back(val); - for(auto& val : m_images) + for (const auto& val : m_images) all_formats.push_back(val); - for(auto& val : m_crossplatform) + for (const auto& val : m_crossplatform) all_formats.push_back(val); - all_formats.push_back(m_pdf); + if (!m_pdf.empty()) + all_formats.push_back(m_pdf); return all_formats; } @@ -259,6 +260,16 @@ CFormatsList CFormatsList::GetOutputExts() return list; } +CFormatsList CFormatsList::GetExtractExts() +{ + CFormatsList list; + + list.m_images.push_back(L"emf"); + list.m_images.push_back(L"wmf"); + + return list; +} + Cx2tTester::Cx2tTester(const std::wstring& configPath) { m_bIsUseSystemFonts = true; @@ -269,14 +280,22 @@ Cx2tTester::Cx2tTester(const std::wstring& configPath) m_bIsFilenamePassword = true; m_bTroughConversion = false; m_bSaveEnvironment = false; + + m_bExtract = false; + m_bConvertBeforeExtract = false; + m_defaultCsvDelimiter = L";"; m_defaultCsvTxtEndcoding = L"UTF-8"; m_inputFormatsList = CFormatsList::GetDefaultExts(); m_outputFormatsList = CFormatsList::GetOutputExts(); + m_extractFormatsList = CFormatsList::GetExtractExts(); m_timeout = 5 * 60; // 5 min + SetConfig(configPath); + m_errorsXmlDirectory = m_outputDirectory + FILE_SEPARATOR_STR + L"_errors"; m_troughConversionDirectory = m_outputDirectory + FILE_SEPARATOR_STR + L"_t"; + m_tempDirectory = m_outputDirectory + 
FILE_SEPARATOR_STR + L"_temp"; m_fontsDirectory = NSFile::GetProcessDirectory() + FILE_SEPARATOR_STR + L"fonts"; @@ -335,6 +354,12 @@ Cx2tTester::~Cx2tTester() m_reportCS.DeleteCriticalSection(); m_outputCS.DeleteCriticalSection(); m_reportStream.CloseFile(); + + for(auto&& val : m_deleteLaterFiles) + NSFile::CFileBinary::Remove(val); + + for(auto&& val : m_deleteLaterDirectories) + NSDirectory::DeleteDirectory(val); } void Cx2tTester::SetConfig(const std::wstring& configPath) @@ -366,6 +391,8 @@ void Cx2tTester::SetConfig(const std::wstring& configPath) else if(name == L"troughConversion" && !node.GetText().empty()) m_bTroughConversion = std::stoi(node.GetText()); else if(name == L"saveEnvironment" && !node.GetText().empty()) m_bSaveEnvironment = std::stoi(node.GetText()); else if(name == L"defaultCsvTxtEncoding" && !node.GetText().empty()) m_defaultCsvTxtEndcoding = node.GetText(); + else if(name == L"extract" && !node.GetText().empty()) m_bExtract = std::stoi(node.GetText()); + else if(name == L"convertBeforeExtract" && !node.GetText().empty()) m_bConvertBeforeExtract = std::stoi(node.GetText()); else if(name == L"defaultCsvDelimiter" && !node.GetText().empty()) m_defaultCsvDelimiter = (wchar_t)std::stoi(node.GetText(), nullptr, 16); else if(name == L"inputFilesList" && !node.GetText().empty()) { @@ -418,17 +445,39 @@ void Cx2tTester::SetConfig(const std::wstring& configPath) exit(-1); } - if(default_input_formats) + if (default_input_formats) m_inputExts = m_inputFormatsList.GetAllExts(); - if(default_output_formats) - m_outputExts = m_outputFormatsList.GetAllExts(); + if (default_output_formats) + { + if (m_bExtract) + m_outputExts = m_extractFormatsList.GetAllExts(); + else + m_outputExts = m_outputFormatsList.GetAllExts(); + } + } void Cx2tTester::Start() { // setup timer m_timeStart = NSTimers::GetTickCount(); + m_outputDirectory = CorrectPathW(m_outputDirectory); + m_errorsXmlDirectory = CorrectPathW(m_errorsXmlDirectory); + m_troughConversionDirectory = 
CorrectPathW(m_troughConversionDirectory); + + // setup & clear output folder + if(NSDirectory::Exists(m_outputDirectory)) + NSDirectory::DeleteDirectory(m_outputDirectory); + + NSDirectory::CreateDirectory(m_outputDirectory); + + // setup & clear errors folder + if(NSDirectory::Exists(m_errorsXmlDirectory)) + NSDirectory::DeleteDirectory(m_errorsXmlDirectory); + + NSDirectory::CreateDirectory(m_errorsXmlDirectory); + // check fonts CApplicationFontsWorker fonts_worker; fonts_worker.m_sDirectory = m_fontsDirectory; @@ -449,23 +498,6 @@ void Cx2tTester::Start() NSFonts::IApplicationFonts* pFonts = fonts_worker.Check(); RELEASEINTERFACE(pFonts); - m_outputDirectory = CorrectPathW(m_outputDirectory); - m_errorsXmlDirectory = CorrectPathW(m_errorsXmlDirectory); - m_troughConversionDirectory = CorrectPathW(m_troughConversionDirectory); - - // setup & clear output folder - if(NSDirectory::Exists(m_outputDirectory)) - NSDirectory::DeleteDirectory(m_outputDirectory); - - NSDirectory::CreateDirectory(m_outputDirectory); - - // setup & clear errors folder - if(NSDirectory::Exists(m_errorsXmlDirectory)) - NSDirectory::DeleteDirectory(m_errorsXmlDirectory); - - NSDirectory::CreateDirectory(m_errorsXmlDirectory); - - std::vector files = NSDirectory::GetFiles(m_inputDirectory, true); for(int i = 0; i < files.size(); i++) { @@ -486,6 +518,54 @@ void Cx2tTester::Start() if(files.size() < m_maxProc) m_maxProc = files.size(); + if (m_bExtract) + { + COfficeFileFormatChecker checker; + COfficeUtils utils; + std::vector files_to_convert; + + for (size_t i = 0; i < files.size(); i++) + if (utils.IsArchive(files[i]) == S_FALSE && checker.isOfficeFile(files[i])) + { + if (m_bConvertBeforeExtract) + files_to_convert.push_back(files[i]); + files.erase(files.begin() + i); + } + + if (!files_to_convert.empty()) + { + if(NSDirectory::Exists(m_tempDirectory)) + NSDirectory::DeleteDirectory(m_tempDirectory); + + NSDirectory::CreateDirectories(m_tempDirectory); + + auto copy_inputDirectory = 
m_inputDirectory; + auto copy_outputDirectory = m_outputDirectory; + auto copy_outputExts = m_outputExts; + + m_outputDirectory = m_tempDirectory; + m_outputExts = {L"docx"}; + + Convert(files_to_convert, true, true); + + m_outputDirectory = copy_outputDirectory; + m_outputExts = copy_outputExts; + + m_inputDirectory = m_tempDirectory; + std::vector temp_files = NSDirectory::GetFiles(m_tempDirectory, true); + Extract(temp_files); + + m_inputDirectory = copy_inputDirectory; + } + + Extract(files); + + if(NSDirectory::Exists(m_tempDirectory)) + NSDirectory::DeleteDirectory(m_tempDirectory); + + return; + } + // conversion in _t directory -> _t directory to output if(m_bTroughConversion) { @@ -512,12 +592,6 @@ void Cx2tTester::Start() Convert(files); WriteTime(); - - for(auto&& val : m_deleteLaterFiles) - NSFile::CFileBinary::Remove(val); - - for(auto&& val : m_deleteLaterDirectories) - NSDirectory::DeleteDirectory(val); } void Cx2tTester::Convert(const std::vector& files, bool bNoDirectory, bool bTrough) @@ -652,6 +726,47 @@ void Cx2tTester::Convert(const std::vector& files, bool bNoDirecto while(!IsAllFree()) NSThreads::Sleep(150); } +void Cx2tTester::Extract(const std::vector& files) +{ + for (int i = 0; i < files.size(); i++) + { + const std::wstring& input_file = files[i]; + std::wstring input_filename = NSFile::GetFileName(input_file); + std::wstring input_file_directory = NSFile::GetDirectoryName(input_file); + std::wstring input_subfolders = input_file_directory.substr(m_inputDirectory.size(), + input_file_directory.size() - m_inputDirectory.size()); + std::wstring output_files_directory = m_outputDirectory + input_subfolders + FILE_SEPARATOR_STR + input_filename; + + if(!NSDirectory::Exists(output_files_directory)) + NSDirectory::CreateDirectories(output_files_directory); + + // waiting... 
+ do + { + NSThreads::Sleep(50); + } while(IsAllBusy()); + + m_coresCS.Enter(); + + // setup & start new extractor + CExtractor *extractor = new CExtractor(this); + extractor->SetInputFile(input_file); + extractor->SetOutputFilesDirectory(output_files_directory); + extractor->SetExtractExts(m_outputExts); + extractor->SetFilesCount(files.size(), i + 1); + extractor->DestroyOnFinish(); + m_currentProc++; + + m_coresCS.Leave(); + + extractor->Start(0); + } + + // waiting all procs end + while(!IsAllFree()) + NSThreads::Sleep(150); +} + void Cx2tTester::WriteReportHeader() { CTemporaryCS CS(&m_reportCS); @@ -844,7 +959,7 @@ DWORD CConverter::ThreadProc() for(int i = 0; i < m_outputExts.size(); i++) { std::wstring output_ext = L"."+ m_outputExts[i]; - int output_format = checker.GetFormatByExtension(output_ext); + int output_format = m_checker.GetFormatByExtension(output_ext); std::wstring xml_params_filename = input_filename + L"_" + output_ext + L".xml"; std::wstring xml_params_file = m_outputFilesDirectory + FILE_SEPARATOR_STR + xml_params_filename; @@ -1077,4 +1192,68 @@ DWORD CConverter::ThreadProc() return 0; } +CExtractor::CExtractor(Cx2tTester* internal) : m_internal(internal) +{ +} +CExtractor::~CExtractor() +{ + Stop(); +} +void CExtractor::SetInputFile(const std::wstring& inputFile) +{ + m_inputFile = inputFile; +} +void CExtractor::SetOutputFilesDirectory(const std::wstring& outputFilesDirectory) +{ + m_outputFilesDirectory = outputFilesDirectory; +} +void CExtractor::SetExtractExts(const std::vector& extractExts) +{ + m_extractExts = extractExts; +} +void CExtractor::SetFilesCount(int totalFiles, int currFile) +{ + m_totalFiles = totalFiles; + m_currFile = currFile; +} + +DWORD CExtractor::ThreadProc() +{ + std::wstring input_filename = NSFile::GetFileName(m_inputFile); + std::wstring input_ext = L'.' 
+ NSFile::GetFileExtention(input_filename); + std::wstring input_filename_no_ext = input_filename.substr(0, input_filename.size() - input_ext.size()); + + for (size_t i = 0; i < m_extractExts.size(); i++) + { + const std::wstring& extract_ext = m_extractExts[i]; + std::wstring output_folder = m_outputFilesDirectory + FILE_SEPARATOR_STR + extract_ext; + + if (NSDirectory::Exists(output_folder)) + NSDirectory::DeleteDirectory(output_folder); + + NSDirectory::CreateDirectories(output_folder); + + std::wstring temp_folder = NSDirectory::CreateDirectoryWithUniqueName(output_folder); + m_utils.ExtractToDirectory(m_inputFile, temp_folder, nullptr, false); + + auto unzip_files = NSDirectory::GetFiles(temp_folder, true); + bool delete_empty = true; + for (const auto& file : unzip_files) + { + if (NSFile::GetFileExtention(file) == m_extractExts[i]) + { + delete_empty = false; + NSFile::CFileBinary::Move(file, output_folder + FILE_SEPARATOR_STR +NSFile::GetFileName(file)); + } + } + if (delete_empty) + NSDirectory::DeleteDirectory(output_folder); + NSDirectory::DeleteDirectory(temp_folder); + } + if (NSDirectory::GetFilesCount(m_outputFilesDirectory, true) == 0) + NSDirectory::DeleteDirectory(m_outputFilesDirectory); + + m_internal->m_currentProc--; + return 0; +} diff --git a/Test/Applications/x2tTester/x2tTester.h b/Test/Applications/x2tTester/x2tTester.h index 47dfac8605..01ce96ada2 100644 --- a/Test/Applications/x2tTester/x2tTester.h +++ b/Test/Applications/x2tTester/x2tTester.h @@ -55,7 +55,10 @@ public: static CFormatsList GetDefaultExts(); // all writable exts - static CFormatsList GetOutputExts(); + static CFormatsList GetOutputExts(); + + // default exts to extract + static CFormatsList GetExtractExts(); private: std::vector m_documents; @@ -111,6 +114,7 @@ private: // parse string like "docx txt" into vector std::vector ParseExtensionsString(std::wstring extensions, const CFormatsList& fl); void Convert(const std::vector& files, bool bNoDirectory = false, bool 
bTrough = false); + void Extract(const std::vector& files); // takes from config std::wstring m_reportFile; @@ -121,6 +125,7 @@ private: std::wstring m_errorsXmlDirectory; std::wstring m_troughConversionDirectory; std::wstring m_fontsDirectory; + std::wstring m_tempDirectory; // fonts bool m_bIsUseSystemFonts; @@ -137,6 +142,7 @@ private: // lists CFormatsList m_inputFormatsList; CFormatsList m_outputFormatsList; + CFormatsList m_extractFormatsList; bool m_bIsErrorsOnly; bool m_bIsTimestamp; @@ -157,6 +163,12 @@ private: std::vector m_deleteLaterFiles; std::vector m_deleteLaterDirectories; + + // extract files with output_ext from input_files + bool m_bExtract; + + // convert to docx before extract + bool m_bConvertBeforeExtract; }; // generates temp xml, convert, calls m_internal->writeReport @@ -194,7 +206,7 @@ private: std::wstring m_inputExt; std::wstring m_fontsDirectory; - COfficeFileFormatChecker checker; + COfficeFileFormatChecker m_checker; std::wstring m_x2tPath; std::wstring m_errorsXmlDirectory; @@ -214,4 +226,30 @@ private: unsigned long m_timeout; }; +// extracts files from office files +class CExtractor : public NSThreads::CBaseThread +{ +public: + CExtractor(Cx2tTester* internal); + virtual ~CExtractor(); + + void SetInputFile(const std::wstring& inputFile); + void SetOutputFilesDirectory(const std::wstring& outputFilesDirectory); + void SetExtractExts(const std::vector& extractExts); + void SetFilesCount(int totalFiles, int currFile); + + virtual DWORD ThreadProc(); + +private: + Cx2tTester* m_internal; + std::wstring m_inputFile; + std::wstring m_outputFilesDirectory; + std::vector m_extractExts; + COfficeUtils m_utils; + + int m_totalFiles; + int m_currFile; + +}; + #endif // X2T_TESTER_H