Skip to content

Commit

Permalink
Revert "handle stop words in WeakAnd"
Browse files Browse the repository at this point in the history
  • Loading branch information
hmusum authored Oct 23, 2024
1 parent cac7a8c commit 00bbab0
Show file tree
Hide file tree
Showing 14 changed files with 17 additions and 105 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,6 @@ class BlueprintBuilderVisitor :
void buildWeakAnd(ProtonWeakAnd &n) {
auto *wand = new WeakAndBlueprint(n.getTargetNumHits(),
_requestContext.get_attribute_blueprint_params().weakand_range,
_requestContext.get_attribute_blueprint_params().abs_weakand_stop_word_limit,
is_search_multi_threaded());
Blueprint::UP result(wand);
for (auto node : n.getChildren()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -351,11 +351,6 @@ MatchToolsFactory::extract_attribute_blueprint_params(const RankSetup& rank_setu
double target_hits_max_adjustment_factor = TargetHitsMaxAdjustmentFactor::lookup(rank_properties, rank_setup.get_target_hits_max_adjustment_factor());
auto fuzzy_matching_algorithm = FuzzyAlgorithm::lookup(rank_properties, rank_setup.get_fuzzy_matching_algorithm());
double weakand_range = temporary::WeakAndRange::lookup(rank_properties, rank_setup.get_weakand_range());
double weakand_stop_word_limit = WeakAndStopWordLimit::lookup(rank_properties, rank_setup.get_weakand_stop_word_limit());

// make sure no words are stop words if the limit is 1.0, even those claiming to match more than everything
uint32_t abs_weakand_stop_word_limit = (weakand_stop_word_limit >= 0.0 && weakand_stop_word_limit < 1.0)
? uint32_t(weakand_stop_word_limit * docid_limit) : uint32_t(-1);

// Note that we count the reserved docid 0 as active.
// This ensures that when searchable-copies=1, the ratio is 1.0.
Expand All @@ -365,8 +360,7 @@ MatchToolsFactory::extract_attribute_blueprint_params(const RankSetup& rank_setu
upper_limit * active_hit_ratio,
target_hits_max_adjustment_factor,
fuzzy_matching_algorithm,
weakand_range,
abs_weakand_stop_word_limit};
weakand_range};
}

AttributeOperationTask::AttributeOperationTask(const RequestContext & requestContext,
Expand Down
6 changes: 3 additions & 3 deletions searchlib/src/tests/queryeval/weak_and/wand_bench_setup.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ struct VespaWandFactory : WandFactory {
~VespaWandFactory() override;
std::string name() const override { return make_string("VESPA WAND (n=%u)", n); }
SearchIterator::UP create(const wand::Terms &terms) override {
return WeakAndSearch::create(terms, wand::MatchParams(_scores, 1, -1, 1), n, true, false);
return WeakAndSearch::create(terms, wand::MatchParams(_scores, 1, 1), n, true, false);
}
};

Expand All @@ -126,7 +126,7 @@ struct VespaArrayWandFactory : WandFactory {
~VespaArrayWandFactory() override;
std::string name() const override { return make_string("VESPA ARRAY WAND (n=%u)", n); }
SearchIterator::UP create(const wand::Terms &terms) override {
return WeakAndSearch::createArrayWand(terms, wand::MatchParams(_scores, 1, -1, 1), wand::TermFrequencyScorer(), n, true, false);
return WeakAndSearch::createArrayWand(terms, wand::MatchParams(_scores, 1, 1), wand::TermFrequencyScorer(), n, true, false);
}
};

Expand All @@ -142,7 +142,7 @@ struct VespaHeapWandFactory : WandFactory {
~VespaHeapWandFactory() override;
std::string name() const override { return make_string("VESPA HEAP WAND (n=%u)", n); }
SearchIterator::UP create(const wand::Terms &terms) override {
return WeakAndSearch::createHeapWand(terms, wand::MatchParams(_scores, 1, -1, 1), wand::TermFrequencyScorer(), n, true, false);
return WeakAndSearch::createHeapWand(terms, wand::MatchParams(_scores, 1, 1), wand::TermFrequencyScorer(), n, true, false);
}
};

Expand Down
31 changes: 3 additions & 28 deletions searchlib/src/tests/queryeval/weak_and/weak_and_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,28 +23,19 @@ struct MyWandSpec : public WandSpec
SharedWeakAndPriorityQueue scores;
uint32_t n;
MatchingPhase matching_phase;
wand::MatchParams my_params;

explicit MyWandSpec(uint32_t n_in)
: WandSpec(),
scores(n_in),
n(n_in),
matching_phase(MatchingPhase::FIRST_PHASE),
my_params(scores, 1, -1, 1)
matching_phase(MatchingPhase::FIRST_PHASE)
{}
SearchIterator *create() {
bool readonly_scores_heap = (matching_phase != MatchingPhase::FIRST_PHASE);
return new TrackedSearch("WAND", getHistory(),
WeakAndSearch::create(getTerms(), my_params, n, true, readonly_scores_heap));
WeakAndSearch::create(getTerms(), wand::MatchParams(scores, 1, 1), n, true, readonly_scores_heap));
}
void set_second_phase() { matching_phase = MatchingPhase::SECOND_PHASE; }
void set_abs_stop_word_limit(uint32_t limit) { my_params.abs_stop_word_limit = limit; }
SimpleResult search() {
SearchIterator::UP search(create());
SimpleResult hits;
hits.search(*search);
return hits;
}
};

struct SimpleWandFixture {
Expand Down Expand Up @@ -149,22 +140,6 @@ TEST(WeakAndTest, require_that_initial_docid_for_subsearches_are_taken_into_acco
history);
}

TEST(WeakAndTest, require_that_the_worst_normal_word_must_match_when_using_stop_word_limit) {
MyWandSpec spec(1000); // avoid limiting hits with heap
spec.leaf(LeafSpec("1").doc(1).doc(2).doc(3).doc(4));
spec.leaf(LeafSpec("2").doc(5).doc(6).doc(7));
spec.leaf(LeafSpec("3").doc(8).doc(9));
spec.set_abs_stop_word_limit(4);
EXPECT_EQ(SimpleResult().addHit(1).addHit(2).addHit(3).addHit(4).addHit(5)
.addHit(6).addHit(7).addHit(8).addHit(9), spec.search());
spec.set_abs_stop_word_limit(3);
EXPECT_EQ(SimpleResult().addHit(5).addHit(6).addHit(7).addHit(8).addHit(9), spec.search());
spec.set_abs_stop_word_limit(2);
EXPECT_EQ(SimpleResult().addHit(8).addHit(9), spec.search()); // Note: must match only non-stop word
spec.set_abs_stop_word_limit(1);
EXPECT_EQ(SimpleResult().addHit(8).addHit(9), spec.search()); // Note: must match best stop word
}

class IteratorChildrenVerifier : public search::test::IteratorChildrenVerifier {
public:
IteratorChildrenVerifier();
Expand All @@ -179,7 +154,7 @@ class IteratorChildrenVerifier : public search::test::IteratorChildrenVerifier {
}
static constexpr size_t LARGE_ENOUGH_HEAP_FOR_ALL = 10000;
_scores.push_back(std::make_unique<SharedWeakAndPriorityQueue>(LARGE_ENOUGH_HEAP_FOR_ALL));
return WeakAndSearch::create(terms, wand::MatchParams(*_scores.back(), 1, -1, 1), -1, strict, false);
return WeakAndSearch::create(terms, wand::MatchParams(*_scores.back(), 1, 1), -1, strict, false);
}
};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,20 +17,17 @@ struct AttributeBlueprintParams
double target_hits_max_adjustment_factor;
vespalib::FuzzyMatchingAlgorithm fuzzy_matching_algorithm;
double weakand_range;
uint32_t abs_weakand_stop_word_limit;

AttributeBlueprintParams(double global_filter_lower_limit_in,
double global_filter_upper_limit_in,
double target_hits_max_adjustment_factor_in,
vespalib::FuzzyMatchingAlgorithm fuzzy_matching_algorithm_in,
double weakand_range_in,
uint32_t abs_wand_stop_word_limit_in)
double weakand_range_in)
: global_filter_lower_limit(global_filter_lower_limit_in),
global_filter_upper_limit(global_filter_upper_limit_in),
target_hits_max_adjustment_factor(target_hits_max_adjustment_factor_in),
fuzzy_matching_algorithm(fuzzy_matching_algorithm_in),
weakand_range(weakand_range_in),
abs_weakand_stop_word_limit(abs_wand_stop_word_limit_in)
weakand_range(weakand_range_in)
{
}

Expand All @@ -39,8 +36,7 @@ struct AttributeBlueprintParams
fef::indexproperties::matching::GlobalFilterUpperLimit::DEFAULT_VALUE,
fef::indexproperties::matching::TargetHitsMaxAdjustmentFactor::DEFAULT_VALUE,
fef::indexproperties::matching::FuzzyAlgorithm::DEFAULT_VALUE,
fef::indexproperties::temporary::WeakAndRange::DEFAULT_VALUE,
fef::indexproperties::matching::WeakAndStopWordLimit::DEFAULT_VALUE)
fef::indexproperties::temporary::WeakAndRange::DEFAULT_VALUE)
{
}
};
Expand Down
16 changes: 0 additions & 16 deletions searchlib/src/vespa/searchlib/fef/indexproperties.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -444,22 +444,6 @@ GlobalFilterUpperLimit::lookup(const Properties &props, double defaultValue)
return lookupDouble(props, NAME, defaultValue);
}

const std::string WeakAndStopWordLimit::NAME("vespa.matching.weakand.stop_word_limit");

const double WeakAndStopWordLimit::DEFAULT_VALUE(1.0);

double
WeakAndStopWordLimit::lookup(const Properties &props)
{
return lookup(props, DEFAULT_VALUE);
}

double
WeakAndStopWordLimit::lookup(const Properties &props, double defaultValue)
{
return lookupDouble(props, NAME, defaultValue);
}

const std::string TargetHitsMaxAdjustmentFactor::NAME("vespa.matching.nns.target_hits_max_adjustment_factor");

const double TargetHitsMaxAdjustmentFactor::DEFAULT_VALUE(20.0);
Expand Down
11 changes: 0 additions & 11 deletions searchlib/src/vespa/searchlib/fef/indexproperties.h
Original file line number Diff line number Diff line change
Expand Up @@ -341,16 +341,6 @@ namespace matching {
static double lookup(const Properties &props, double defaultValue);
};

/**
* Property to control how much of the corpus a single term must match to be treated as a stop word by Vespa Wand
**/
struct WeakAndStopWordLimit {
static const std::string NAME;
static const double DEFAULT_VALUE;
static double lookup(const Properties &props);
static double lookup(const Properties &props, double defaultValue);
};

/**
* Property to control the algorithm using for fuzzy matching.
**/
Expand All @@ -360,7 +350,6 @@ namespace matching {
static vespalib::FuzzyMatchingAlgorithm lookup(const Properties& props);
static vespalib::FuzzyMatchingAlgorithm lookup(const Properties& props, vespalib::FuzzyMatchingAlgorithm default_value);
};

/**
* Sort blueprints based on relative cost estimate rather than est_hits
**/
Expand Down
1 change: 0 additions & 1 deletion searchlib/src/vespa/searchlib/fef/ranksetup.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,6 @@ RankSetup::configure()
set_target_hits_max_adjustment_factor(matching::TargetHitsMaxAdjustmentFactor::lookup(_indexEnv.getProperties()));
set_fuzzy_matching_algorithm(matching::FuzzyAlgorithm::lookup(_indexEnv.getProperties()));
set_weakand_range(temporary::WeakAndRange::lookup(_indexEnv.getProperties()));
set_weakand_stop_word_limit(matching::WeakAndStopWordLimit::lookup(_indexEnv.getProperties()));
_mutateOnMatch._attribute = mutate::on_match::Attribute::lookup(_indexEnv.getProperties());
_mutateOnMatch._operation = mutate::on_match::Operation::lookup(_indexEnv.getProperties());
_mutateOnFirstPhase._attribute = mutate::on_first_phase::Attribute::lookup(_indexEnv.getProperties());
Expand Down
3 changes: 0 additions & 3 deletions searchlib/src/vespa/searchlib/fef/ranksetup.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,6 @@ class RankSetup
double _global_filter_upper_limit;
double _target_hits_max_adjustment_factor;
double _weakand_range;
double _weakand_stop_word_limit;
vespalib::FuzzyMatchingAlgorithm _fuzzy_matching_algorithm;
MutateOperation _mutateOnMatch;
MutateOperation _mutateOnFirstPhase;
Expand Down Expand Up @@ -413,8 +412,6 @@ class RankSetup
vespalib::FuzzyMatchingAlgorithm get_fuzzy_matching_algorithm() const { return _fuzzy_matching_algorithm; }
void set_weakand_range(double v) { _weakand_range = v; }
double get_weakand_range() const { return _weakand_range; }
void set_weakand_stop_word_limit(double v) { _weakand_stop_word_limit = v; }
double get_weakand_stop_word_limit() const { return _weakand_stop_word_limit; }

/**
* This method may be used to indicate that certain features
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -419,11 +419,10 @@ WeakAndBlueprint::my_flow(InFlow in_flow) const
return AnyFlow::create<OrFlow>(in_flow);
}

WeakAndBlueprint::WeakAndBlueprint(uint32_t n, float idf_range, uint32_t abs_stop_word_limit, bool thread_safe)
WeakAndBlueprint::WeakAndBlueprint(uint32_t n, float idf_range, bool thread_safe)
: _scores(WeakAndPriorityQueue::createHeap(n, thread_safe)),
_n(n),
_idf_range(idf_range),
_abs_stop_word_limit(abs_stop_word_limit),
_weights(),
_matching_phase(MatchingPhase::FIRST_PHASE)
{}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,6 @@ class WeakAndBlueprint : public IntermediateBlueprint
std::unique_ptr<WeakAndPriorityQueue> _scores;
uint32_t _n;
float _idf_range;
uint32_t _abs_stop_word_limit;
std::vector<uint32_t> _weights;
MatchingPhase _matching_phase;

Expand All @@ -110,8 +109,8 @@ class WeakAndBlueprint : public IntermediateBlueprint
fef::MatchData &md) const override;
SearchIterator::UP createFilterSearch(FilterConstraint constraint) const override;

explicit WeakAndBlueprint(uint32_t n) : WeakAndBlueprint(n, 0.0, -1, true) {}
WeakAndBlueprint(uint32_t n, float idf_range, uint32_t abs_stop_word_limit, bool thread_safe);
explicit WeakAndBlueprint(uint32_t n) : WeakAndBlueprint(n, 0.0, true) {}
WeakAndBlueprint(uint32_t n, float idf_range, bool thread_safe);
~WeakAndBlueprint() override;
void addTerm(Blueprint::UP bp, uint32_t weight) {
addChild(std::move(bp));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ struct ParallelWeakAndSearch : public SearchIterator
double thresholdBoostFactor_in,
uint32_t scoresAdjustFrequency_in,
uint32_t docIdLimit_in) noexcept
: wand::MatchParams(scores_in, scoreThreshold_in, -1, scoresAdjustFrequency_in),
: wand::MatchParams(scores_in, scoreThreshold_in, scoresAdjustFrequency_in),
thresholdBoostFactor(thresholdBoostFactor_in),
docIdLimit(docIdLimit_in)
{}
Expand Down
6 changes: 2 additions & 4 deletions searchlib/src/vespa/searchlib/queryeval/wand/wand_parts.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,15 +35,13 @@ struct MatchParams
{
WeakAndHeap &scores;
score_t scoreThreshold;
uint32_t abs_stop_word_limit;
const uint32_t scoresAdjustFrequency;
MatchParams(WeakAndHeap &scores_in) noexcept
: MatchParams(scores_in, 1, -1, DEFAULT_PARALLEL_WAND_SCORES_ADJUST_FREQUENCY)
: MatchParams(scores_in, 1, DEFAULT_PARALLEL_WAND_SCORES_ADJUST_FREQUENCY)
{}
MatchParams(WeakAndHeap &scores_in, score_t scoreThreshold_in, uint32_t abs_stop_word_limit_in, uint32_t scoresAdjustFrequency_in) noexcept
MatchParams(WeakAndHeap &scores_in, score_t scoreThreshold_in, uint32_t scoresAdjustFrequency_in) noexcept
: scores(scores_in),
scoreThreshold(scoreThreshold_in),
abs_stop_word_limit(abs_stop_word_limit_in),
scoresAdjustFrequency(scoresAdjustFrequency_in)
{}
};
Expand Down
19 changes: 1 addition & 18 deletions searchlib/src/vespa/searchlib/queryeval/wand/weak_and_search.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,23 +10,6 @@
namespace search::queryeval {
namespace wand {

score_t calculate_initial_wand_threshold(const auto &scorer, const Terms &terms, const MatchParams &matchParams) {
std::optional<score_t> best_stop_word;
std::optional<score_t> worst_normal_word;
for (const auto &t: terms) {
score_t s = scorer.calculateMaxScore(t);
if (t.estHits > matchParams.abs_stop_word_limit) {
best_stop_word = std::max(s, best_stop_word.value_or(s));
} else {
worst_normal_word = std::min(s, worst_normal_word.value_or(s));
}
}
score_t limit = matchParams.scoreThreshold;
limit = std::max(limit, best_stop_word.value_or(limit));
limit = std::max(limit, worst_normal_word.value_or(limit));
return limit;
}

template <typename FutureHeap, typename PastHeap, bool IS_STRICT>
class WeakAndSearchLR final : public WeakAndSearch
{
Expand Down Expand Up @@ -71,7 +54,7 @@ class WeakAndSearchLR final : public WeakAndSearch
: _terms(terms, scorer, 0, {}),
_heaps(DocIdOrder(_terms.docId()), _terms.size()),
_algo(),
_threshold(calculate_initial_wand_threshold(scorer, terms, matchParams)),
_threshold(matchParams.scoreThreshold),
_matchParams(matchParams),
_localScores(),
_n(n),
Expand Down

0 comments on commit 00bbab0

Please sign in to comment.