VOICEVOX · y-chan · Mar 15, 2022 · Mar 13, 2022 · Mar 13, 2022 · Mar 13, 2022
diff --git a/core/src/core.h b/core/src/core.h
@@ -29,8 +29,8 @@ extern "C" {
 typedef enum {
   // 成功
   VOICEVOX_RESULT_SUCCEED = 0,
-  // OpenJTalk初期化に失敗した
-  VOICEVOX_RESULT_NOT_INITIALIZE_OPEN_JTALK_ERR = 1,
+  // The OpenJTalk dictionary has not been loaded
+  VOICEVOX_RESULT_NOT_LOADED_OPENJTALK_DICT = 1,
 } VoicevoxResultCode;
 /**
  * @fn
@@ -125,10 +125,10 @@ VOICEVOX_CORE_API const char *last_error_message();
 
 /**
  * @fn
- * open jtalkを初期化する
+ * Loads the Open JTalk dictionary located at dict_path
  * @return 結果コード
  */
-VOICEVOX_CORE_API VoicevoxResultCode voicevox_initialize_openjtalk(const char *dict_path);
+VOICEVOX_CORE_API VoicevoxResultCode voicevox_load_openjtalk_dict(const char *dict_path);
 
 /**
  * @fn

diff --git a/core/src/engine.cpp b/core/src/engine.cpp
@@ -5,35 +5,41 @@
 
 #include "core.h"
 #include "engine/model.h"
-#include "engine/openjtalk.h"
 #include "engine/synthesis_engine.h"
 
 using namespace voicevox::core::engine;
 
-// TODO:SynthesisEngineにopenjtalkを持たせるためshared_ptrにしているが、やめたい
-static std::shared_ptr<OpenJTalk> openjtalk;
-static std::unique_ptr<SynthesisEngine> engine;
+static SynthesisEngine engine;
 
-VoicevoxResultCode voicevox_initialize_openjtalk(const char *dict_path) {
-  // TODO: error handling
-  openjtalk = std::make_shared<OpenJTalk>(dict_path);
+// Loads the OpenJTalk dictionary at dict_path into the file-level engine.
+// Returns VOICEVOX_RESULT_SUCCEED on success, or
+// VOICEVOX_RESULT_NOT_LOADED_OPENJTALK_DICT when dict_path is null or the
+// load throws (OpenJTalk::load reports failures via std::runtime_error).
+VoicevoxResultCode voicevox_load_openjtalk_dict(const char *dict_path) {
+  // A null pointer cannot be converted to the std::string the engine expects.
+  if (dict_path == nullptr) {
+    return VOICEVOX_RESULT_NOT_LOADED_OPENJTALK_DICT;
+  }
+  try {
+    engine.load_openjtalk_dict(dict_path);
+  } catch (...) {
+    // This is a C entry point, so no exception may escape to the caller.
+    return VOICEVOX_RESULT_NOT_LOADED_OPENJTALK_DICT;
+  }
   return VOICEVOX_RESULT_SUCCEED;
 }
 
 VoicevoxResultCode voicevox_tts(const char *text, int64_t speaker_id, int *output_binary_size, uint8_t **output_wav) {
-  if (!openjtalk) {
-    return VOICEVOX_RESULT_NOT_INITIALIZE_OPEN_JTALK_ERR;
-  }
-  if (!engine) {
-    engine = std::make_unique<SynthesisEngine>(openjtalk);
+  if (!engine.is_openjtalk_dict_loaded()) {
+    return VOICEVOX_RESULT_NOT_LOADED_OPENJTALK_DICT;
   }
 
-  std::vector<AccentPhraseModel> accent_phrases = engine->create_accent_phrases(std::string(text), &speaker_id);
+  std::vector<AccentPhraseModel> accent_phrases = engine.create_accent_phrases(std::string(text), &speaker_id);
   const AudioQueryModel audio_query = {
-      accent_phrases, 1.0f, 0.0f, 1.0f, 1.0f, 0.1f, 0.1f, engine->default_sampling_rate, false, "",
+      accent_phrases, 1.0f, 0.0f, 1.0f, 1.0f, 0.1f, 0.1f, engine.default_sampling_rate, false, "",
   };
 
-  const auto wav = engine->synthesis_wave_format(audio_query, &speaker_id, output_binary_size);
+  const auto wav = engine.synthesis_wave_format(audio_query, &speaker_id, output_binary_size);
   auto *wav_heap = new uint8_t[*output_binary_size];
   std::copy(wav.begin(), wav.end(), wav_heap);
   *output_wav = wav_heap;
@@ -44,8 +50,8 @@ void voicevox_wav_free(uint8_t *wav) { delete wav; }
 
 const char *voicevox_error_result_to_message(VoicevoxResultCode result_code) {
   switch (result_code) {
-    case VOICEVOX_RESULT_NOT_INITIALIZE_OPEN_JTALK_ERR:
-      return "Call initialize_openjtalk() first.";
+    case VOICEVOX_RESULT_NOT_LOADED_OPENJTALK_DICT:
+      return "Call voicevox_load_openjtalk_dict() first.";
 
     default:
       throw std::runtime_error("Unexpected error result code.");

diff --git a/core/src/engine/openjtalk.cpp b/core/src/engine/openjtalk.cpp
@@ -48,6 +48,7 @@ void OpenJTalk::load(const std::string& dn_mecab) {
     clear();
     throw std::runtime_error("failed to initialize mecab");
   }
+  dict_loaded = true;
 }
 
 void OpenJTalk::clear() {

diff --git a/core/src/engine/openjtalk.h b/core/src/engine/openjtalk.h
@@ -25,13 +25,15 @@ class OpenJTalk {
     JPCommon_initialize(&jpcommon);
   }
 
-  OpenJTalk(const std::string& dn_mecab) : OpenJTalk() { load(dn_mecab); }
-
   ~OpenJTalk() { clear(); }
 
   std::vector<std::string> extract_fullcontext(std::string text);
 
   void load(const std::string& dn_mecab);
   void clear();
+  bool is_dict_loaded() const { return dict_loaded; }  // reports the dict_loaded flag
+
+ private:
+  bool dict_loaded = false;  // set to true at the end of load()
 };
 }  // namespace voicevox::core::engine
diff --git a/core/src/engine/synthesis_engine.cpp b/core/src/engine/synthesis_engine.cpp
@@ -114,7 +114,7 @@ std::vector<AccentPhraseModel> SynthesisEngine::create_accent_phrases(std::strin
     return {};
   }
 
-  Utterance utterance = extract_full_context_label(*m_openjtalk, text);
+  Utterance utterance = extract_full_context_label(m_openjtalk, text);
   if (utterance.breath_groups.empty()) {
     return {};
   }
@@ -513,6 +513,8 @@ std::vector<float> SynthesisEngine::synthesis(AudioQueryModel query, int64_t *sp
   return wave;
 }
 
+void SynthesisEngine::load_openjtalk_dict(const std::string &dict_path) { m_openjtalk.load(dict_path); }
+
 void SynthesisEngine::initial_process(std::vector<AccentPhraseModel> &accent_phrases,
                                       std::vector<MoraModel> &flatten_moras, std::vector<std::string> &phoneme_str_list,
                                       std::vector<OjtPhoneme> &phoneme_data_list) {

diff --git a/core/src/engine/synthesis_engine.h b/core/src/engine/synthesis_engine.h
@@ -26,7 +26,7 @@ class SynthesisEngine {
  public:
   const unsigned int default_sampling_rate = 24000;
 
-  SynthesisEngine(std::shared_ptr<OpenJTalk> openjtalk) : m_openjtalk(openjtalk) {}
+  SynthesisEngine() = default;
 
   std::vector<AccentPhraseModel> create_accent_phrases(std::string text, int64_t *speaker_id);
   std::vector<AccentPhraseModel> replace_mora_data(std::vector<AccentPhraseModel> accent_phrases, int64_t *speaker_id);
@@ -37,8 +37,11 @@ class SynthesisEngine {
   std::vector<uint8_t> synthesis_wave_format(AudioQueryModel query, int64_t *speaker_id, int *binary_size,
                                              bool enable_interrogative_upspeak = true);
 
+  void load_openjtalk_dict(const std::string &dict_path);  // forwards dict_path to m_openjtalk.load()
+  bool is_openjtalk_dict_loaded() const { return m_openjtalk.is_dict_loaded(); }  // asks the owned OpenJTalk
+
  private:
-  std::shared_ptr<OpenJTalk> m_openjtalk;
+  OpenJTalk m_openjtalk;
 
   void initial_process(std::vector<AccentPhraseModel> &accent_phrases, std::vector<MoraModel> &flatten_moras,
                        std::vector<std::string> &phoneme_str_list, std::vector<OjtPhoneme> &phoneme_data_list);