openvinotoolkit · Wovchena · Sep 19, 2024 · Sep 19, 2024 · Sep 20, 2024 · Sep 20, 2024
diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml
@@ -510,7 +510,7 @@ jobs:
               predicted_greedy = f.readline()
           with open('predictions_prompt_lookup.txt', 'r') as f:
               predicted_prompt_lookup = f.readline()
-          assert predicted_greedy == predicted_prompt_lookup
+          assert predicted_greedy == predicted_prompt_lookup, f'Expected {predicted_greedy}, actual {predicted_prompt_lookup}'
           "
           echo "Prompt lookup" passed
       - name: run and compare (model with seq_length_axis = 1)
@@ -531,7 +531,7 @@ jobs:
               predicted_greedy = f.readline()
           with open('predictions_prompt_lookup.txt', 'r') as f:
               predicted_prompt_lookup = f.readline()
-          assert predicted_greedy == predicted_prompt_lookup
+          assert predicted_greedy == predicted_prompt_lookup, f'Expected {predicted_greedy}, actual {predicted_prompt_lookup}'
           "
           echo "Prompt lookup" passed
 

diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp
@@ -287,7 +287,7 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector<o
         m_requests.clear();
     };
 
-    bool continue_generation = true, step_throws_exception = false;
+    bool continue_generation = true;  // second operand of the loop condition below; the loop runs only while this stays true
     while (has_non_finished_requests() && continue_generation) {
         try {
             step();

diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
@@ -7,6 +7,7 @@
 #include <algorithm>
 #include <nlohmann/json.hpp>
 #include <openvino/openvino.hpp>
+#include <limits>
 #include "openvino/genai/continuous_batching_pipeline.hpp"
 #include "openvino/genai/generation_config.hpp"
 #include "openvino/genai/llm_pipeline.hpp"
@@ -548,6 +549,7 @@ ov::genai::LLMPipeline::LLMPipeline(
     const ov::genai::Tokenizer& tokenizer,
     OptionalGenerationConfig generation_config
 ) {
+    OPENVINO_THROW("Not supported");  // NOTE(review): assuming OPENVINO_THROW always throws, every statement below in this constructor is unreachable — confirm this is intended
     auto start_time = std::chrono::steady_clock::now();
     m_pimpl = std::make_unique<StatefulLLMPipeline>(request, tokenizer, generation_config);
     auto stop_time = std::chrono::steady_clock::now();
@@ -560,12 +562,25 @@ ov::genai::LLMPipeline::LLMPipeline(
     const std::string& device,
     const ov::AnyMap& properties
 ){
+    // Debug trace (disabled): std::cout << "Using continuous batching backend.\n";
     auto start_time = std::chrono::steady_clock::now();
     if (properties.find(ov::genai::scheduler_config.name()) != properties.end()) {
         auto config_without_scheduler_config = properties;
         config_without_scheduler_config.erase(ov::genai::scheduler_config.name());
         auto& scheduler_config = properties.at(ov::genai::scheduler_config.name()).as<SchedulerConfig>();
         m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, tokenizer, scheduler_config, device, config_without_scheduler_config);
+        // Debug trace (disabled): std::cout << "Found custom SchedulerConfig.\n";
+    } else if (true) {  // NOTE(review): always taken when no scheduler_config property is supplied, so the "NPU" and final else branches below can never run
+        SchedulerConfig scheduler_config;  // fallback config built locally; only the two fields below are set explicitly
+        scheduler_config.cache_size = 1;  // NOTE(review): unit of cache_size is not visible here (presumably GB) — confirm
+        scheduler_config.enable_prefix_caching = false;
+        m_pimpl = std::make_unique<ContinuousBatchingAdapter>(
+            models_path,
+            tokenizer,
+            scheduler_config,
+            device,
+            properties  // passed through unchanged: this branch is only reached when properties holds no scheduler_config entry
+        );
     } else if ("NPU" == device) {
         m_pimpl = std::make_unique<StaticLLMPipeline>(models_path, tokenizer, device, properties);
     } else {
@@ -580,12 +595,23 @@ ov::genai::LLMPipeline::LLMPipeline(
     const std::string& device,
     const ov::AnyMap& config
 ){
+    // Debug trace (disabled): std::cout << "Using continuous batching backend.\n";
     auto start_time = std::chrono::steady_clock::now();
     if (config.find(ov::genai::scheduler_config.name()) != config.end()) {
         auto config_without_scheduler_config = config;
         config_without_scheduler_config.erase(ov::genai::scheduler_config.name());
         auto& scheduler_config = config.at(ov::genai::scheduler_config.name()).as<SchedulerConfig>();
         m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, scheduler_config, device, config_without_scheduler_config);
+    } else if (true) {  // NOTE(review): always taken when no scheduler_config property is supplied, so the "NPU" and final else branches below can never run
+        SchedulerConfig scheduler_config;  // fallback config built locally; only the two fields below are set explicitly
+        scheduler_config.cache_size = 1;  // NOTE(review): unit of cache_size is not visible here (presumably GB) — confirm
+        scheduler_config.enable_prefix_caching = false;
+        m_pimpl = std::make_unique<ContinuousBatchingAdapter>(
+            models_path,
+            scheduler_config,
+            device,
+            config  // passed through unchanged: this branch is only reached when config holds no scheduler_config entry
+        );
     } else if ("NPU" == device) {
         m_pimpl = std::make_unique<StaticLLMPipeline>(models_path, device, config);
     } else {
@@ -618,6 +644,9 @@ void ov::genai::LLMPipeline::set_generation_config(const GenerationConfig& confi
     if (config.eos_token_id == -1)
         m_pimpl->m_generation_config.eos_token_id = default_eos_token_id;
 
+    if (config.max_new_tokens == SIZE_MAX)  // SIZE_MAX acts as the "limit not set" sentinel (presumably the field's default — verify); NOTE(review): SIZE_MAX is declared in <cstdint>, not <limits>
+        m_pimpl->m_generation_config.max_new_tokens = 100;  // fallback cap written to the stored config before validate() runs
+
     m_pimpl->m_generation_config.validate();
 }