From 00bbab029bed4c293be6faf79b7579a3d33a3ee8 Mon Sep 17 00:00:00 2001 From: Harald Musum Date: Wed, 23 Oct 2024 17:12:37 +0200 Subject: [PATCH] Revert "handle stop words in WeakAnd" --- .../proton/matching/blueprintbuilder.cpp | 1 - .../proton/matching/match_tools.cpp | 8 +---- .../queryeval/weak_and/wand_bench_setup.hpp | 6 ++-- .../queryeval/weak_and/weak_and_test.cpp | 31 ++----------------- .../attribute/attribute_blueprint_params.h | 10 ++---- .../vespa/searchlib/fef/indexproperties.cpp | 16 ---------- .../src/vespa/searchlib/fef/indexproperties.h | 11 ------- .../src/vespa/searchlib/fef/ranksetup.cpp | 1 - searchlib/src/vespa/searchlib/fef/ranksetup.h | 3 -- .../queryeval/intermediate_blueprints.cpp | 3 +- .../queryeval/intermediate_blueprints.h | 5 ++- .../queryeval/wand/parallel_weak_and_search.h | 2 +- .../searchlib/queryeval/wand/wand_parts.h | 6 ++-- .../queryeval/wand/weak_and_search.cpp | 19 +----------- 14 files changed, 17 insertions(+), 105 deletions(-) diff --git a/searchcore/src/vespa/searchcore/proton/matching/blueprintbuilder.cpp b/searchcore/src/vespa/searchcore/proton/matching/blueprintbuilder.cpp index 45953a9ac0c0..f8b6666afc4f 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/blueprintbuilder.cpp +++ b/searchcore/src/vespa/searchcore/proton/matching/blueprintbuilder.cpp @@ -72,7 +72,6 @@ class BlueprintBuilderVisitor : void buildWeakAnd(ProtonWeakAnd &n) { auto *wand = new WeakAndBlueprint(n.getTargetNumHits(), _requestContext.get_attribute_blueprint_params().weakand_range, - _requestContext.get_attribute_blueprint_params().abs_weakand_stop_word_limit, is_search_multi_threaded()); Blueprint::UP result(wand); for (auto node : n.getChildren()) { diff --git a/searchcore/src/vespa/searchcore/proton/matching/match_tools.cpp b/searchcore/src/vespa/searchcore/proton/matching/match_tools.cpp index 64d4dddc5dbd..93989332bdca 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/match_tools.cpp +++ b/searchcore/src/vespa/searchcore/proton/matching/match_tools.cpp @@ -351,11 +351,6 @@ MatchToolsFactory::extract_attribute_blueprint_params(const RankSetup& rank_setu double target_hits_max_adjustment_factor = TargetHitsMaxAdjustmentFactor::lookup(rank_properties, rank_setup.get_target_hits_max_adjustment_factor()); auto fuzzy_matching_algorithm = FuzzyAlgorithm::lookup(rank_properties, rank_setup.get_fuzzy_matching_algorithm()); double weakand_range = temporary::WeakAndRange::lookup(rank_properties, rank_setup.get_weakand_range()); - double weakand_stop_word_limit = WeakAndStopWordLimit::lookup(rank_properties, rank_setup.get_weakand_stop_word_limit()); - - // make sure no words are stop words if the limit is 1.0, even those claiming to match more than everything - uint32_t abs_weakand_stop_word_limit = (weakand_stop_word_limit >= 0.0 && weakand_stop_word_limit < 1.0) - ? uint32_t(weakand_stop_word_limit * docid_limit) : uint32_t(-1); // Note that we count the reserved docid 0 as active. // This ensures that when searchable-copies=1, the ratio is 1.0. @@ -365,8 +360,7 @@ MatchToolsFactory::extract_attribute_blueprint_params(const RankSetup& rank_setu upper_limit * active_hit_ratio, target_hits_max_adjustment_factor, fuzzy_matching_algorithm, - weakand_range, - abs_weakand_stop_word_limit}; + weakand_range}; } AttributeOperationTask::AttributeOperationTask(const RequestContext & requestContext, diff --git a/searchlib/src/tests/queryeval/weak_and/wand_bench_setup.hpp b/searchlib/src/tests/queryeval/weak_and/wand_bench_setup.hpp index 1766cc214955..460de0b3cc71 100644 --- a/searchlib/src/tests/queryeval/weak_and/wand_bench_setup.hpp +++ b/searchlib/src/tests/queryeval/weak_and/wand_bench_setup.hpp @@ -110,7 +110,7 @@ struct VespaWandFactory : WandFactory { ~VespaWandFactory() override; std::string name() const override { return make_string("VESPA WAND (n=%u)", n); } SearchIterator::UP create(const wand::Terms &terms) override { - return WeakAndSearch::create(terms, wand::MatchParams(_scores, 1, -1, 1), n, true, false); + return WeakAndSearch::create(terms, wand::MatchParams(_scores, 1, 1), n, true, false); } }; @@ -126,7 +126,7 @@ struct VespaArrayWandFactory : WandFactory { ~VespaArrayWandFactory() override; std::string name() const override { return make_string("VESPA ARRAY WAND (n=%u)", n); } SearchIterator::UP create(const wand::Terms &terms) override { - return WeakAndSearch::createArrayWand(terms, wand::MatchParams(_scores, 1, -1, 1), wand::TermFrequencyScorer(), n, true, false); + return WeakAndSearch::createArrayWand(terms, wand::MatchParams(_scores, 1, 1), wand::TermFrequencyScorer(), n, true, false); } }; @@ -142,7 +142,7 @@ struct VespaHeapWandFactory : WandFactory { ~VespaHeapWandFactory() override; std::string name() const override { return make_string("VESPA HEAP WAND (n=%u)", n); } SearchIterator::UP create(const wand::Terms &terms) override { - return WeakAndSearch::createHeapWand(terms, wand::MatchParams(_scores, 1, -1, 1), wand::TermFrequencyScorer(), n, true, false); + return WeakAndSearch::createHeapWand(terms, wand::MatchParams(_scores, 1, 1), wand::TermFrequencyScorer(), n, true, false); } }; diff --git a/searchlib/src/tests/queryeval/weak_and/weak_and_test.cpp b/searchlib/src/tests/queryeval/weak_and/weak_and_test.cpp index bbf47cb61590..3fc31e4456d5 100644 --- a/searchlib/src/tests/queryeval/weak_and/weak_and_test.cpp +++ b/searchlib/src/tests/queryeval/weak_and/weak_and_test.cpp @@ -23,28 +23,19 @@ struct MyWandSpec : public WandSpec SharedWeakAndPriorityQueue scores; uint32_t n; MatchingPhase matching_phase; - wand::MatchParams my_params; explicit MyWandSpec(uint32_t n_in) : WandSpec(), scores(n_in), n(n_in), - matching_phase(MatchingPhase::FIRST_PHASE), - my_params(scores, 1, -1, 1) + matching_phase(MatchingPhase::FIRST_PHASE) {} SearchIterator *create() { bool readonly_scores_heap = (matching_phase != MatchingPhase::FIRST_PHASE); return new TrackedSearch("WAND", getHistory(), - WeakAndSearch::create(getTerms(), my_params, n, true, readonly_scores_heap)); + WeakAndSearch::create(getTerms(), wand::MatchParams(scores, 1, 1), n, true, readonly_scores_heap)); } void set_second_phase() { matching_phase = MatchingPhase::SECOND_PHASE; } - void set_abs_stop_word_limit(uint32_t limit) { my_params.abs_stop_word_limit = limit; } - SimpleResult search() { - SearchIterator::UP search(create()); - SimpleResult hits; - hits.search(*search); - return hits; - } }; struct SimpleWandFixture { @@ -149,22 +140,6 @@ TEST(WeakAndTest, require_that_initial_docid_for_subsearches_are_taken_into_acco history); } -TEST(WeakAndTest, require_that_the_worst_normal_word_must_match_when_using_stop_word_limit) { - MyWandSpec spec(1000); // avoid limiting hits with heap - spec.leaf(LeafSpec("1").doc(1).doc(2).doc(3).doc(4)); - spec.leaf(LeafSpec("2").doc(5).doc(6).doc(7)); - spec.leaf(LeafSpec("3").doc(8).doc(9)); - spec.set_abs_stop_word_limit(4); - EXPECT_EQ(SimpleResult().addHit(1).addHit(2).addHit(3).addHit(4).addHit(5) - .addHit(6).addHit(7).addHit(8).addHit(9), spec.search()); - spec.set_abs_stop_word_limit(3); - EXPECT_EQ(SimpleResult().addHit(5).addHit(6).addHit(7).addHit(8).addHit(9), spec.search()); - spec.set_abs_stop_word_limit(2); - EXPECT_EQ(SimpleResult().addHit(8).addHit(9), spec.search()); // Note: must match only non-stop word - spec.set_abs_stop_word_limit(1); - EXPECT_EQ(SimpleResult().addHit(8).addHit(9), spec.search()); // Note: must match best stop word -} - class IteratorChildrenVerifier : public search::test::IteratorChildrenVerifier { public: IteratorChildrenVerifier(); @@ -179,7 +154,7 @@ class IteratorChildrenVerifier : public search::test::IteratorChildrenVerifier { } static constexpr size_t LARGE_ENOUGH_HEAP_FOR_ALL = 10000; _scores.push_back(std::make_unique(LARGE_ENOUGH_HEAP_FOR_ALL)); - return WeakAndSearch::create(terms, wand::MatchParams(*_scores.back(), 1, -1, 1), -1, strict, false); + return WeakAndSearch::create(terms, wand::MatchParams(*_scores.back(), 1, 1), -1, strict, false); } }; diff --git a/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_params.h b/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_params.h index 0f77baafe29e..ac6fc6f603a2 100644 --- a/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_params.h +++ b/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_params.h @@ -17,20 +17,17 @@ struct AttributeBlueprintParams double target_hits_max_adjustment_factor; vespalib::FuzzyMatchingAlgorithm fuzzy_matching_algorithm; double weakand_range; - uint32_t abs_weakand_stop_word_limit; AttributeBlueprintParams(double global_filter_lower_limit_in, double global_filter_upper_limit_in, double target_hits_max_adjustment_factor_in, vespalib::FuzzyMatchingAlgorithm fuzzy_matching_algorithm_in, - double weakand_range_in, - uint32_t abs_wand_stop_word_limit_in) + double weakand_range_in) : global_filter_lower_limit(global_filter_lower_limit_in), global_filter_upper_limit(global_filter_upper_limit_in), target_hits_max_adjustment_factor(target_hits_max_adjustment_factor_in), fuzzy_matching_algorithm(fuzzy_matching_algorithm_in), - weakand_range(weakand_range_in), - abs_weakand_stop_word_limit(abs_wand_stop_word_limit_in) + weakand_range(weakand_range_in) { } @@ -39,8 +36,7 @@ struct AttributeBlueprintParams fef::indexproperties::matching::GlobalFilterUpperLimit::DEFAULT_VALUE, fef::indexproperties::matching::TargetHitsMaxAdjustmentFactor::DEFAULT_VALUE, fef::indexproperties::matching::FuzzyAlgorithm::DEFAULT_VALUE, - fef::indexproperties::temporary::WeakAndRange::DEFAULT_VALUE, - fef::indexproperties::matching::WeakAndStopWordLimit::DEFAULT_VALUE) + fef::indexproperties::temporary::WeakAndRange::DEFAULT_VALUE) { } }; diff --git a/searchlib/src/vespa/searchlib/fef/indexproperties.cpp b/searchlib/src/vespa/searchlib/fef/indexproperties.cpp index f09e3dbbaac2..f67af176ead6 100644 --- a/searchlib/src/vespa/searchlib/fef/indexproperties.cpp +++ b/searchlib/src/vespa/searchlib/fef/indexproperties.cpp @@ -444,22 +444,6 @@ GlobalFilterUpperLimit::lookup(const Properties &props, double defaultValue) return lookupDouble(props, NAME, defaultValue); } -const std::string WeakAndStopWordLimit::NAME("vespa.matching.weakand.stop_word_limit"); - -const double WeakAndStopWordLimit::DEFAULT_VALUE(1.0); - -double -WeakAndStopWordLimit::lookup(const Properties &props) -{ - return lookup(props, DEFAULT_VALUE); -} - -double -WeakAndStopWordLimit::lookup(const Properties &props, double defaultValue) -{ - return lookupDouble(props, NAME, defaultValue); -} - const std::string TargetHitsMaxAdjustmentFactor::NAME("vespa.matching.nns.target_hits_max_adjustment_factor"); const double TargetHitsMaxAdjustmentFactor::DEFAULT_VALUE(20.0); diff --git a/searchlib/src/vespa/searchlib/fef/indexproperties.h b/searchlib/src/vespa/searchlib/fef/indexproperties.h index c0904a94ff39..7494eacb6282 100644 --- a/searchlib/src/vespa/searchlib/fef/indexproperties.h +++ b/searchlib/src/vespa/searchlib/fef/indexproperties.h @@ -341,16 +341,6 @@ namespace matching { static double lookup(const Properties &props, double defaultValue); }; - /** - * Property to control how much of the corpus a single term must match to be treated as a stop word by Vespa Wand - **/ - struct WeakAndStopWordLimit { - static const std::string NAME; - static const double DEFAULT_VALUE; - static double lookup(const Properties &props); - static double lookup(const Properties &props, double defaultValue); - }; - /** * Property to control the algorithm using for fuzzy matching. **/ @@ -360,7 +350,6 @@ namespace matching { static vespalib::FuzzyMatchingAlgorithm lookup(const Properties& props); static vespalib::FuzzyMatchingAlgorithm lookup(const Properties& props, vespalib::FuzzyMatchingAlgorithm default_value); }; - /** * Sort blueprints based on relative cost estimate rather than est_hits **/ diff --git a/searchlib/src/vespa/searchlib/fef/ranksetup.cpp b/searchlib/src/vespa/searchlib/fef/ranksetup.cpp index 768e7e04a057..2410a86b2ab8 100644 --- a/searchlib/src/vespa/searchlib/fef/ranksetup.cpp +++ b/searchlib/src/vespa/searchlib/fef/ranksetup.cpp @@ -132,7 +132,6 @@ RankSetup::configure() set_target_hits_max_adjustment_factor(matching::TargetHitsMaxAdjustmentFactor::lookup(_indexEnv.getProperties())); set_fuzzy_matching_algorithm(matching::FuzzyAlgorithm::lookup(_indexEnv.getProperties())); set_weakand_range(temporary::WeakAndRange::lookup(_indexEnv.getProperties())); - set_weakand_stop_word_limit(matching::WeakAndStopWordLimit::lookup(_indexEnv.getProperties())); _mutateOnMatch._attribute = mutate::on_match::Attribute::lookup(_indexEnv.getProperties()); _mutateOnMatch._operation = mutate::on_match::Operation::lookup(_indexEnv.getProperties()); _mutateOnFirstPhase._attribute = mutate::on_first_phase::Attribute::lookup(_indexEnv.getProperties()); diff --git a/searchlib/src/vespa/searchlib/fef/ranksetup.h b/searchlib/src/vespa/searchlib/fef/ranksetup.h index 8b656eb771f8..a0198b194cf9 100644 --- a/searchlib/src/vespa/searchlib/fef/ranksetup.h +++ b/searchlib/src/vespa/searchlib/fef/ranksetup.h @@ -84,7 +84,6 @@ class RankSetup double _global_filter_upper_limit; double _target_hits_max_adjustment_factor; double _weakand_range; - double _weakand_stop_word_limit; vespalib::FuzzyMatchingAlgorithm _fuzzy_matching_algorithm; MutateOperation _mutateOnMatch; MutateOperation _mutateOnFirstPhase; @@ -413,8 +412,6 @@ class RankSetup vespalib::FuzzyMatchingAlgorithm get_fuzzy_matching_algorithm() const { return _fuzzy_matching_algorithm; } void set_weakand_range(double v) { _weakand_range = v; } double get_weakand_range() const { return _weakand_range; } - void set_weakand_stop_word_limit(double v) { _weakand_stop_word_limit = v; } - double get_weakand_stop_word_limit() const { return _weakand_stop_word_limit; } /** * This method may be used to indicate that certain features diff --git a/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.cpp b/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.cpp index dab867a79ee0..72feeaebc13a 100644 --- a/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.cpp @@ -419,11 +419,10 @@ WeakAndBlueprint::my_flow(InFlow in_flow) const return AnyFlow::create(in_flow); } -WeakAndBlueprint::WeakAndBlueprint(uint32_t n, float idf_range, uint32_t abs_stop_word_limit, bool thread_safe) +WeakAndBlueprint::WeakAndBlueprint(uint32_t n, float idf_range, bool thread_safe) : _scores(WeakAndPriorityQueue::createHeap(n, thread_safe)), _n(n), _idf_range(idf_range), - _abs_stop_word_limit(abs_stop_word_limit), _weights(), _matching_phase(MatchingPhase::FIRST_PHASE) {} diff --git a/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.h b/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.h index 0e74b8edf1fa..016ee67cac8c 100644 --- a/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.h +++ b/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.h @@ -92,7 +92,6 @@ class WeakAndBlueprint : public IntermediateBlueprint std::unique_ptr _scores; uint32_t _n; float _idf_range; - uint32_t _abs_stop_word_limit; std::vector _weights; MatchingPhase _matching_phase; @@ -110,8 +109,8 @@ class WeakAndBlueprint : public IntermediateBlueprint fef::MatchData &md) const override; SearchIterator::UP createFilterSearch(FilterConstraint constraint) const override; - explicit WeakAndBlueprint(uint32_t n) : WeakAndBlueprint(n, 0.0, -1, true) {} - WeakAndBlueprint(uint32_t n, float idf_range, uint32_t abs_stop_word_limit, bool thread_safe); + explicit WeakAndBlueprint(uint32_t n) : WeakAndBlueprint(n, 0.0, true) {} + WeakAndBlueprint(uint32_t n, float idf_range, bool thread_safe); ~WeakAndBlueprint() override; void addTerm(Blueprint::UP bp, uint32_t weight) { addChild(std::move(bp)); diff --git a/searchlib/src/vespa/searchlib/queryeval/wand/parallel_weak_and_search.h b/searchlib/src/vespa/searchlib/queryeval/wand/parallel_weak_and_search.h index 0fae6be9d144..130bd46f470f 100644 --- a/searchlib/src/vespa/searchlib/queryeval/wand/parallel_weak_and_search.h +++ b/searchlib/src/vespa/searchlib/queryeval/wand/parallel_weak_and_search.h @@ -29,7 +29,7 @@ struct ParallelWeakAndSearch : public SearchIterator double thresholdBoostFactor_in, uint32_t scoresAdjustFrequency_in, uint32_t docIdLimit_in) noexcept - : wand::MatchParams(scores_in, scoreThreshold_in, -1, scoresAdjustFrequency_in), + : wand::MatchParams(scores_in, scoreThreshold_in, scoresAdjustFrequency_in), thresholdBoostFactor(thresholdBoostFactor_in), docIdLimit(docIdLimit_in) {} diff --git a/searchlib/src/vespa/searchlib/queryeval/wand/wand_parts.h b/searchlib/src/vespa/searchlib/queryeval/wand/wand_parts.h index 2abb0ed8bc83..69b6c4d78ce4 100644 --- a/searchlib/src/vespa/searchlib/queryeval/wand/wand_parts.h +++ b/searchlib/src/vespa/searchlib/queryeval/wand/wand_parts.h @@ -35,15 +35,13 @@ struct MatchParams { WeakAndHeap &scores; score_t scoreThreshold; - uint32_t abs_stop_word_limit; const uint32_t scoresAdjustFrequency; MatchParams(WeakAndHeap &scores_in) noexcept - : MatchParams(scores_in, 1, -1, DEFAULT_PARALLEL_WAND_SCORES_ADJUST_FREQUENCY) + : MatchParams(scores_in, 1, DEFAULT_PARALLEL_WAND_SCORES_ADJUST_FREQUENCY) {} - MatchParams(WeakAndHeap &scores_in, score_t scoreThreshold_in, uint32_t abs_stop_word_limit_in, uint32_t scoresAdjustFrequency_in) noexcept + MatchParams(WeakAndHeap &scores_in, score_t scoreThreshold_in, uint32_t scoresAdjustFrequency_in) noexcept : scores(scores_in), scoreThreshold(scoreThreshold_in), - abs_stop_word_limit(abs_stop_word_limit_in), scoresAdjustFrequency(scoresAdjustFrequency_in) {} }; diff --git a/searchlib/src/vespa/searchlib/queryeval/wand/weak_and_search.cpp b/searchlib/src/vespa/searchlib/queryeval/wand/weak_and_search.cpp index 23e35e13d653..6f0a4b8f8259 100644 --- a/searchlib/src/vespa/searchlib/queryeval/wand/weak_and_search.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/wand/weak_and_search.cpp @@ -10,23 +10,6 @@ namespace search::queryeval { namespace wand { -score_t calculate_initial_wand_threshold(const auto &scorer, const Terms &terms, const MatchParams &matchParams) { - std::optional best_stop_word; - std::optional worst_normal_word; - for (const auto &t: terms) { - score_t s = scorer.calculateMaxScore(t); - if (t.estHits > matchParams.abs_stop_word_limit) { - best_stop_word = std::max(s, best_stop_word.value_or(s)); - } else { - worst_normal_word = std::min(s, worst_normal_word.value_or(s)); - } - } - score_t limit = matchParams.scoreThreshold; - limit = std::max(limit, best_stop_word.value_or(limit)); - limit = std::max(limit, worst_normal_word.value_or(limit)); - return limit; -} - template class WeakAndSearchLR final : public WeakAndSearch { @@ -71,7 +54,7 @@ class WeakAndSearchLR final : public WeakAndSearch : _terms(terms, scorer, 0, {}), _heaps(DocIdOrder(_terms.docId()), _terms.size()), _algo(), - _threshold(calculate_initial_wand_threshold(scorer, terms, matchParams)), + _threshold(matchParams.scoreThreshold), _matchParams(matchParams), _localScores(), _n(n),