From a9b519b384d861905753023d1c3d9165a590887b Mon Sep 17 00:00:00 2001 From: Sandesh Kumar Date: Tue, 19 Dec 2023 12:58:18 +0530 Subject: [PATCH 01/30] Use Collector.setWeight to improve aggregation performance Signed-off-by: Sandesh Kumar --- .../GlobalOrdinalsStringTermsAggregator.java | 26 +++++++++++++++++++ .../aggregations/support/ValuesSource.java | 2 +- .../search/internal/ContextIndexSearcher.java | 5 ++++ 3 files changed, 32 insertions(+), 1 deletion(-) diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java index 5ed899408ab40..b3552360296b8 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java @@ -37,6 +37,9 @@ import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.SortedDocValues; import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.index.Terms; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.Weight; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.PriorityQueue; @@ -61,6 +64,7 @@ import org.opensearch.search.aggregations.bucket.terms.SignificanceLookup.BackgroundFrequencyForBytes; import org.opensearch.search.aggregations.bucket.terms.heuristic.SignificanceHeuristic; import org.opensearch.search.aggregations.support.ValuesSource; +import org.opensearch.search.aggregations.support.ValuesSource.Bytes.WithOrdinals; import org.opensearch.search.internal.SearchContext; import java.io.IOException; @@ -70,6 +74,7 @@ import java.util.function.Function; import java.util.function.LongPredicate; import java.util.function.LongUnaryOperator; +import java.util.logging.Logger; import static org.opensearch.search.aggregations.InternalOrder.isKeyOrder; import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_ORDS; @@ -80,11 +85,16 @@ * @opensearch.internal */ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggregator { + + // testing only - will remove + protected Logger logger = Logger.getLogger(GlobalOrdinalsStringTermsAggregator.class.getName()); protected final ResultStrategy resultStrategy; protected final ValuesSource.Bytes.WithOrdinals valuesSource; private final LongPredicate acceptedGlobalOrdinals; private final long valueCount; + + private Weight weight; private final GlobalOrdLookupFunction lookupGlobalOrd; protected final CollectionStrategy collectionStrategy; protected int segmentsWithSingleValuedOrds = 0; @@ -142,8 +152,24 @@ String descriptCollectionStrategy() { return collectionStrategy.describe(); } + public void setWeight(Weight weight) { + this.weight = weight; + } + @Override public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, LeafBucketCollector sub) throws IOException { + if (weight != null && weight.getQuery() instanceof MatchAllDocsQuery) { + if ((weight.count(ctx) == 0) + && Terms.getTerms(ctx.reader(), String.valueOf(((WithOrdinals.FieldData) valuesSource).indexFieldData.getFieldName())) + .size() == 0) { + return LeafBucketCollector.NO_OP_COLLECTOR; + // } else if (weight.count(ctx) == ctx.reader().maxDoc() && weight.getQuery() instanceof MatchAllDocsQuery) { + // no deleted documents & top level query matches 
everything + // iterate over the terms - doc frequency for each termsEnum directly + // return appropriate LeafCollector + } + } + SortedSetDocValues globalOrds = valuesSource.globalOrdinalsValues(ctx); collectionStrategy.globalOrdsReady(globalOrds); SortedDocValues singleValues = DocValues.unwrapSingleton(globalOrds); diff --git a/server/src/main/java/org/opensearch/search/aggregations/support/ValuesSource.java b/server/src/main/java/org/opensearch/search/aggregations/support/ValuesSource.java index 3ce1f0447dfcc..c3a5d445be7a1 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/support/ValuesSource.java +++ b/server/src/main/java/org/opensearch/search/aggregations/support/ValuesSource.java @@ -238,7 +238,7 @@ public long globalMaxOrd(IndexSearcher indexSearcher) throws IOException { */ public static class FieldData extends WithOrdinals { - protected final IndexOrdinalsFieldData indexFieldData; + public final IndexOrdinalsFieldData indexFieldData; public FieldData(IndexOrdinalsFieldData indexFieldData) { this.indexFieldData = indexFieldData; diff --git a/server/src/main/java/org/opensearch/search/internal/ContextIndexSearcher.java b/server/src/main/java/org/opensearch/search/internal/ContextIndexSearcher.java index 403b0b545c113..ec3ed2332d0b8 100644 --- a/server/src/main/java/org/opensearch/search/internal/ContextIndexSearcher.java +++ b/server/src/main/java/org/opensearch/search/internal/ContextIndexSearcher.java @@ -387,6 +387,11 @@ public BulkScorer bulkScorer(LeafReaderContext context) throws IOException { return null; } } + + @Override + public int count(LeafReaderContext context) throws IOException { + return weight.count(context); + } }; } else { return weight; From 21761c9af1fc9dd0ab2fe41ca95384e892fb4665 Mon Sep 17 00:00:00 2001 From: Sandesh Kumar Date: Tue, 23 Jan 2024 14:15:01 -0800 Subject: [PATCH 02/30] Add case for no deleted docs Signed-off-by: Sandesh Kumar --- .../GlobalOrdinalsStringTermsAggregator.java | 99 ++++++++++++++++--- 1 file changed, 86 insertions(+), 13 deletions(-) diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java index b3552360296b8..21008ba0e2083 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java @@ -35,10 +35,11 @@ import org.apache.lucene.index.DocValues; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.SortedDocValues; import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.index.Terms; -import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.Weight; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; @@ -49,6 +50,7 @@ import org.opensearch.common.util.LongHash; import org.opensearch.core.common.io.stream.StreamOutput; import org.opensearch.core.xcontent.XContentBuilder; +import org.opensearch.index.mapper.DocCountFieldMapper; import org.opensearch.search.DocValueFormat; import org.opensearch.search.aggregations.AggregationExecutionException; import org.opensearch.search.aggregations.Aggregator; @@ -64,7 +66,6 @@ import 
org.opensearch.search.aggregations.bucket.terms.SignificanceLookup.BackgroundFrequencyForBytes; import org.opensearch.search.aggregations.bucket.terms.heuristic.SignificanceHeuristic; import org.opensearch.search.aggregations.support.ValuesSource; -import org.opensearch.search.aggregations.support.ValuesSource.Bytes.WithOrdinals; import org.opensearch.search.internal.SearchContext; import java.io.IOException; @@ -78,6 +79,7 @@ import static org.opensearch.search.aggregations.InternalOrder.isKeyOrder; import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_ORDS; +import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; /** * An aggregator of string values that relies on global ordinals in order to build buckets. @@ -94,6 +96,8 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr private final LongPredicate acceptedGlobalOrdinals; private final long valueCount; + private final String fieldName; + private Weight weight; private final GlobalOrdLookupFunction lookupGlobalOrd; protected final CollectionStrategy collectionStrategy; @@ -146,6 +150,7 @@ public GlobalOrdinalsStringTermsAggregator( return new DenseGlobalOrds(); }); } + this.fieldName = ((ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource).indexFieldData.getFieldName(); } String descriptCollectionStrategy() { @@ -156,22 +161,80 @@ public void setWeight(Weight weight) { this.weight = weight; } - @Override - public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, LeafBucketCollector sub) throws IOException { - if (weight != null && weight.getQuery() instanceof MatchAllDocsQuery) { - if ((weight.count(ctx) == 0) - && Terms.getTerms(ctx.reader(), String.valueOf(((WithOrdinals.FieldData) valuesSource).indexFieldData.getFieldName())) - .size() == 0) { - return LeafBucketCollector.NO_OP_COLLECTOR; - // } else if (weight.count(ctx) == ctx.reader().maxDoc() && weight.getQuery() instanceof MatchAllDocsQuery) { - // no deleted documents & top level query matches everything - // iterate over the terms - doc frequency for each termsEnum directly - // return appropriate LeafCollector + /** + Collects term frequencies for a given field from a LeafReaderContext. + @param ctx The LeafReaderContext to collect terms from + @param ords The SortedSetDocValues for the field's ordinals + @param ordCountConsumer A consumer to accept collected term frequencies + @return A LeafBucketCollector implementation that throws an exception, since collection is complete + @throws IOException If an I/O error occurs during reading */ + LeafBucketCollector termDocFreqCollector(LeafReaderContext ctx, SortedSetDocValues ords, BiConsumer ordCountConsumer) + throws IOException { + // long n0 = System.nanoTime(), n1, n2, n3, n4, n5 = 0; + if (weight.count(ctx) != ctx.reader().maxDoc()) { + // Top-level query does not match all docs in this segment. + return null; + } + // n1 = System.nanoTime(); + + Terms aggTerms = ctx.reader().terms(this.fieldName); + if (aggTerms == null) { + // Field is not indexed. + return null; + } + // n2 = System.nanoTime(); + NumericDocValues docCountValues = DocValues.getNumeric(ctx.reader(), DocCountFieldMapper.NAME); + if (docCountValues.nextDoc() != NO_MORE_DOCS) { + // This segment has at least one document with the _doc_count field. 
+ return null; + } + // n3 = System.nanoTime(); + TermsEnum indexTermsEnum = aggTerms.iterator(); + BytesRef indexTerm = indexTermsEnum.next(); + TermsEnum ordinalTermsEnum = ords.termsEnum(); + BytesRef ordinalTerm = ordinalTermsEnum.next(); + // n4 = System.nanoTime(); + while (indexTerm != null && ordinalTerm != null) { + int compare = indexTerm.compareTo(ordinalTerm); + if (compare == 0) { + if (acceptedGlobalOrdinals.test(ordinalTermsEnum.ord())) { + ordCountConsumer.accept(ordinalTermsEnum.ord(), indexTermsEnum.docFreq()); + } + indexTerm = indexTermsEnum.next(); + ordinalTerm = ordinalTermsEnum.next(); + } else if (compare < 0) { + indexTerm = indexTermsEnum.next(); + } else { + ordinalTerm = ordinalTermsEnum.next(); } + // n5 = System.nanoTime(); } + // logger.info((n1 - n0) + " " + (n2 - n1) + " " + (n3 - n2) + " " + (n4 - n3) + " " + (n5 - n4)); + // return new LeafBucketCollector() { + // @Override + // public void collect(int doc, long owningBucketOrd) { + // throw new CollectionTerminatedException(); + // } + // }; + return LeafBucketCollector.NO_OP_COLLECTOR; + } + @Override + public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, LeafBucketCollector sub) throws IOException { SortedSetDocValues globalOrds = valuesSource.globalOrdinalsValues(ctx); collectionStrategy.globalOrdsReady(globalOrds); + + if (collectionStrategy instanceof DenseGlobalOrds && sub == LeafBucketCollector.NO_OP_COLLECTOR) { + LeafBucketCollector termDocFreqCollector = termDocFreqCollector( + ctx, + globalOrds, + (o, c) -> incrementBucketDocCount(collectionStrategy.globalOrdToBucketOrd(0, o), c) + ); + if (termDocFreqCollector != null) { + return termDocFreqCollector; + } + } + SortedDocValues singleValues = DocValues.unwrapSingleton(globalOrds); if (singleValues != null) { segmentsWithSingleValuedOrds++; @@ -369,6 +432,16 @@ public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, LeafBucketCol final SortedSetDocValues segmentOrds = valuesSource.ordinalsValues(ctx); segmentDocCounts = context.bigArrays().grow(segmentDocCounts, 1 + segmentOrds.getValueCount()); assert sub == LeafBucketCollector.NO_OP_COLLECTOR; + + LeafBucketCollector termDocFreqCollector = this.termDocFreqCollector( + ctx, + segmentOrds, + (o, c) -> segmentDocCounts.increment(o + 1, c) + ); + if (termDocFreqCollector != null) { + return termDocFreqCollector; + } + final SortedDocValues singleValues = DocValues.unwrapSingleton(segmentOrds); mapping = valuesSource.globalOrdinalsMapping(ctx); // Dense mode doesn't support include/exclude so we don't have to check it here. 
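
The idea that patches 01 through 06 converge on can be restated outside the aggregator: when a segment's top-level Weight reports that it matches every document (weight.count(ctx) == maxDoc(), which also implies the segment has no deleted docs), the per-term bucket counts equal each term's document frequency, so they can be read straight from the terms dictionary with TermsEnum#docFreq() instead of iterating documents, merge-joining the postings TermsEnum with the field's doc-values TermsEnum to recover ordinals. The sketch below is an illustrative restatement under those assumptions, not part of the patch series; the class name TermDocFreqSketch and the OrdCountConsumer hook are hypothetical stand-ins for the aggregator's incrementBucketDocCount plumbing, while the Lucene calls mirror the ones used in the patches.

import java.io.IOException;
import java.util.function.LongPredicate;

import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Weight;
import org.apache.lucene.util.BytesRef;

final class TermDocFreqSketch {

    /** Hypothetical stand-in for the aggregator's (ordinal, docCount) consumer. */
    interface OrdCountConsumer {
        void accept(long ord, int count) throws IOException;
    }

    /**
     * Returns true when counts were taken from the terms dictionary and per-document
     * collection can be skipped for this segment; false means "fall back to the normal
     * document-by-document collector".
     */
    static boolean tryCountFromTermsDict(
        LeafReaderContext ctx,
        Weight topLevelWeight,
        String fieldName,
        SortedSetDocValues ords,
        LongPredicate acceptedOrds,
        OrdCountConsumer consumer
    ) throws IOException {
        // Only valid when the query matches every document in the segment; otherwise
        // docFreq() would also count documents the query excludes or that are deleted.
        if (topLevelWeight == null || topLevelWeight.count(ctx) != ctx.reader().maxDoc()) {
            return false;
        }
        Terms terms = ctx.reader().terms(fieldName);
        if (terms == null) {
            return false; // field is not indexed in this segment
        }
        // "_doc_count" (DocCountFieldMapper.NAME in the patches) makes one hit stand for
        // several documents, which docFreq() cannot see, so bail out if any doc has it.
        NumericDocValues docCount = DocValues.getNumeric(ctx.reader(), "_doc_count");
        if (docCount.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            return false;
        }
        // Merge-join the postings terms with the doc-values terms: both enums are sorted,
        // so a single pass pairs each term's docFreq() with its ordinal.
        TermsEnum indexTerms = terms.iterator();
        TermsEnum ordTerms = ords.termsEnum();
        BytesRef indexTerm = indexTerms.next();
        BytesRef ordTerm = ordTerms.next();
        while (indexTerm != null && ordTerm != null) {
            int cmp = indexTerm.compareTo(ordTerm);
            if (cmp == 0) {
                if (acceptedOrds.test(ordTerms.ord())) {
                    consumer.accept(ordTerms.ord(), indexTerms.docFreq());
                }
                indexTerm = indexTerms.next();
                ordTerm = ordTerms.next();
            } else if (cmp < 0) {
                indexTerm = indexTerms.next();
            } else {
                ordTerm = ordTerms.next();
            }
        }
        return true;
    }
}

In the later patches this check is gated further: it is applied only for the dense global-ordinals collection strategy with standard (non-significant) term results and a no-op sub collector, since docFreq() can replace collection only when nothing else needs to see the individual documents.
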
From 0ed5c0d98d68dbfc43050b5b9b2dc5b7337ab537 Mon Sep 17 00:00:00 2001 From: Sandesh Kumar Date: Wed, 24 Jan 2024 11:31:47 -0800 Subject: [PATCH 03/30] Minor refactoring - reverting making a field public Signed-off-by: Sandesh Kumar --- .../bucket/terms/GlobalOrdinalsStringTermsAggregator.java | 2 +- .../search/aggregations/support/ValuesSource.java | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java index 21008ba0e2083..870c6be2e037c 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java @@ -150,7 +150,7 @@ public GlobalOrdinalsStringTermsAggregator( return new DenseGlobalOrds(); }); } - this.fieldName = ((ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource).indexFieldData.getFieldName(); + this.fieldName = ((ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource).getIndexFieldName(); } String descriptCollectionStrategy() { diff --git a/server/src/main/java/org/opensearch/search/aggregations/support/ValuesSource.java b/server/src/main/java/org/opensearch/search/aggregations/support/ValuesSource.java index c3a5d445be7a1..1f4dd429e094e 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/support/ValuesSource.java +++ b/server/src/main/java/org/opensearch/search/aggregations/support/ValuesSource.java @@ -238,12 +238,16 @@ public long globalMaxOrd(IndexSearcher indexSearcher) throws IOException { */ public static class FieldData extends WithOrdinals { - public final IndexOrdinalsFieldData indexFieldData; + protected final IndexOrdinalsFieldData indexFieldData; public FieldData(IndexOrdinalsFieldData indexFieldData) { this.indexFieldData = indexFieldData; } + public String getIndexFieldName() { + return this.indexFieldData.getFieldName(); + } + @Override public SortedBinaryDocValues bytesValues(LeafReaderContext context) { final LeafOrdinalsFieldData atomicFieldData = indexFieldData.load(context); From 554e82d225c205978a40c1d3dd6d6991369ad0d2 Mon Sep 17 00:00:00 2001 From: Sandesh Kumar Date: Wed, 24 Jan 2024 11:42:58 -0800 Subject: [PATCH 04/30] Fixing weight and declaring it a s final Signed-off-by: Sandesh Kumar --- .../bucket/terms/GlobalOrdinalsStringTermsAggregator.java | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java index 870c6be2e037c..7652267c7a2eb 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java @@ -40,6 +40,7 @@ import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.ScoreMode; import org.apache.lucene.search.Weight; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; @@ -98,7 +99,7 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr private final 
String fieldName; - private Weight weight; + private final Weight weight; private final GlobalOrdLookupFunction lookupGlobalOrd; protected final CollectionStrategy collectionStrategy; protected int segmentsWithSingleValuedOrds = 0; @@ -151,16 +152,13 @@ public GlobalOrdinalsStringTermsAggregator( }); } this.fieldName = ((ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource).getIndexFieldName(); + this.weight = context.searcher().createWeight(context.query(), ScoreMode.COMPLETE_NO_SCORES, 1f); } String descriptCollectionStrategy() { return collectionStrategy.describe(); } - public void setWeight(Weight weight) { - this.weight = weight; - } - /** Collects term frequencies for a given field from a LeafReaderContext. @param ctx The LeafReaderContext to collect terms from From 050519a366a60d0cf026b4d77bd5a8097fd5062f Mon Sep 17 00:00:00 2001 From: Sandesh Kumar Date: Wed, 24 Jan 2024 16:21:03 -0800 Subject: [PATCH 05/30] Revert "Fixing weight and declaring it a s final" This reverts commit 851b7599d4735e910cc9cc37d9ab4eb8176b29a2. Signed-off-by: Sandesh Kumar --- .../bucket/terms/GlobalOrdinalsStringTermsAggregator.java | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java index 7652267c7a2eb..870c6be2e037c 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java @@ -40,7 +40,6 @@ import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.search.ScoreMode; import org.apache.lucene.search.Weight; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; @@ -99,7 +98,7 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr private final String fieldName; - private final Weight weight; + private Weight weight; private final GlobalOrdLookupFunction lookupGlobalOrd; protected final CollectionStrategy collectionStrategy; protected int segmentsWithSingleValuedOrds = 0; @@ -152,13 +151,16 @@ public GlobalOrdinalsStringTermsAggregator( }); } this.fieldName = ((ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource).getIndexFieldName(); - this.weight = context.searcher().createWeight(context.query(), ScoreMode.COMPLETE_NO_SCORES, 1f); } String descriptCollectionStrategy() { return collectionStrategy.describe(); } + public void setWeight(Weight weight) { + this.weight = weight; + } + /** Collects term frequencies for a given field from a LeafReaderContext. 
@param ctx The LeafReaderContext to collect terms from From 7ed030b5a2ad4b6bbdf14a908d84dd2c55900453 Mon Sep 17 00:00:00 2001 From: Sandesh Kumar Date: Wed, 24 Jan 2024 16:31:43 -0800 Subject: [PATCH 06/30] Bypassing doc iteration Signed-off-by: Sandesh Kumar --- .../GlobalOrdinalsStringTermsAggregator.java | 58 ++++++++----------- .../search/internal/ContextIndexSearcher.java | 3 + 2 files changed, 27 insertions(+), 34 deletions(-) diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java index 870c6be2e037c..7adcf6230b54e 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java @@ -75,7 +75,6 @@ import java.util.function.Function; import java.util.function.LongPredicate; import java.util.function.LongUnaryOperator; -import java.util.logging.Logger; import static org.opensearch.search.aggregations.InternalOrder.isKeyOrder; import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_ORDS; @@ -87,9 +86,6 @@ * @opensearch.internal */ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggregator { - - // testing only - will remove - protected Logger logger = Logger.getLogger(GlobalOrdinalsStringTermsAggregator.class.getName()); protected final ResultStrategy resultStrategy; protected final ValuesSource.Bytes.WithOrdinals valuesSource; @@ -164,58 +160,52 @@ public void setWeight(Weight weight) { /** Collects term frequencies for a given field from a LeafReaderContext. @param ctx The LeafReaderContext to collect terms from - @param ords The SortedSetDocValues for the field's ordinals + @param globalOrds The SortedSetDocValues for the field's ordinals @param ordCountConsumer A consumer to accept collected term frequencies - @return A LeafBucketCollector implementation that throws an exception, since collection is complete - @throws IOException If an I/O error occurs during reading */ - LeafBucketCollector termDocFreqCollector(LeafReaderContext ctx, SortedSetDocValues ords, BiConsumer ordCountConsumer) - throws IOException { - // long n0 = System.nanoTime(), n1, n2, n3, n4, n5 = 0; + @return A no-operation LeafBucketCollector implementation, since collection is complete + @throws IOException If an I/O error occurs during reading + */ + LeafBucketCollector termDocFreqCollector( + LeafReaderContext ctx, + SortedSetDocValues globalOrds, + BiConsumer ordCountConsumer + ) throws IOException { if (weight.count(ctx) != ctx.reader().maxDoc()) { // Top-level query does not match all docs in this segment. return null; } - // n1 = System.nanoTime(); - Terms aggTerms = ctx.reader().terms(this.fieldName); - if (aggTerms == null) { + Terms segmentTerms = ctx.reader().terms(this.fieldName); + if (segmentTerms == null) { // Field is not indexed. return null; } - // n2 = System.nanoTime(); + NumericDocValues docCountValues = DocValues.getNumeric(ctx.reader(), DocCountFieldMapper.NAME); if (docCountValues.nextDoc() != NO_MORE_DOCS) { // This segment has at least one document with the _doc_count field. 
return null; } - // n3 = System.nanoTime(); - TermsEnum indexTermsEnum = aggTerms.iterator(); + + TermsEnum indexTermsEnum = segmentTerms.iterator(); BytesRef indexTerm = indexTermsEnum.next(); - TermsEnum ordinalTermsEnum = ords.termsEnum(); - BytesRef ordinalTerm = ordinalTermsEnum.next(); - // n4 = System.nanoTime(); + TermsEnum globalOrdinalTermsEnum = globalOrds.termsEnum(); + BytesRef ordinalTerm = globalOrdinalTermsEnum.next(); + while (indexTerm != null && ordinalTerm != null) { int compare = indexTerm.compareTo(ordinalTerm); if (compare == 0) { - if (acceptedGlobalOrdinals.test(ordinalTermsEnum.ord())) { - ordCountConsumer.accept(ordinalTermsEnum.ord(), indexTermsEnum.docFreq()); + if (acceptedGlobalOrdinals.test(globalOrdinalTermsEnum.ord())) { + ordCountConsumer.accept(globalOrdinalTermsEnum.ord(), indexTermsEnum.docFreq()); } indexTerm = indexTermsEnum.next(); - ordinalTerm = ordinalTermsEnum.next(); + ordinalTerm = globalOrdinalTermsEnum.next(); } else if (compare < 0) { indexTerm = indexTermsEnum.next(); } else { - ordinalTerm = ordinalTermsEnum.next(); + ordinalTerm = globalOrdinalTermsEnum.next(); } - // n5 = System.nanoTime(); } - // logger.info((n1 - n0) + " " + (n2 - n1) + " " + (n3 - n2) + " " + (n4 - n3) + " " + (n5 - n4)); - // return new LeafBucketCollector() { - // @Override - // public void collect(int doc, long owningBucketOrd) { - // throw new CollectionTerminatedException(); - // } - // }; return LeafBucketCollector.NO_OP_COLLECTOR; } @@ -228,10 +218,10 @@ public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, LeafBucketCol LeafBucketCollector termDocFreqCollector = termDocFreqCollector( ctx, globalOrds, - (o, c) -> incrementBucketDocCount(collectionStrategy.globalOrdToBucketOrd(0, o), c) + (ord, docCount) -> incrementBucketDocCount(collectionStrategy.globalOrdToBucketOrd(0, ord), docCount) ); if (termDocFreqCollector != null) { - return termDocFreqCollector; + return null; } } @@ -436,7 +426,7 @@ public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, LeafBucketCol LeafBucketCollector termDocFreqCollector = this.termDocFreqCollector( ctx, segmentOrds, - (o, c) -> segmentDocCounts.increment(o + 1, c) + (ord, docCount) -> segmentDocCounts.increment(ord + 1, docCount) ); if (termDocFreqCollector != null) { return termDocFreqCollector; diff --git a/server/src/main/java/org/opensearch/search/internal/ContextIndexSearcher.java b/server/src/main/java/org/opensearch/search/internal/ContextIndexSearcher.java index ec3ed2332d0b8..275a75c33406c 100644 --- a/server/src/main/java/org/opensearch/search/internal/ContextIndexSearcher.java +++ b/server/src/main/java/org/opensearch/search/internal/ContextIndexSearcher.java @@ -309,6 +309,9 @@ private void searchLeaf(LeafReaderContext ctx, Weight weight, Collector collecto // See please https://github.com/apache/lucene/pull/964 collector.setWeight(weight); leafCollector = collector.getLeafCollector(ctx); + if (leafCollector == null) { + return; + } } catch (CollectionTerminatedException e) { // there is no doc of interest in this reader context // continue with the following leaf From d7db848c4092b4ae0368aa268498288f918a9564 Mon Sep 17 00:00:00 2001 From: Sandesh Kumar Date: Fri, 2 Feb 2024 16:23:33 -0800 Subject: [PATCH 07/30] Refactoring and changelog addition Signed-off-by: Sandesh Kumar --- CHANGELOG.md | 1 + .../terms/GlobalOrdinalsStringTermsAggregator.java | 12 +++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3e7229a2a0336..2be1d8eda34f6 
100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -128,6 +128,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ### Changed - Allow composite aggregation to run under a parent filter aggregation ([#11499](https://github.com/opensearch-project/OpenSearch/pull/11499)) +- Improve string terms aggregation performance using Collector#setWeight ([#11643](https://github.com/opensearch-project/OpenSearch/pull/11643)) ### Deprecated diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java index 7adcf6230b54e..641fab5a2e5da 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java @@ -40,6 +40,7 @@ import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.ScoreMode; import org.apache.lucene.search.Weight; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; @@ -92,7 +93,7 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr private final LongPredicate acceptedGlobalOrdinals; private final long valueCount; - private final String fieldName; + private String fieldName; private Weight weight; private final GlobalOrdLookupFunction lookupGlobalOrd; @@ -146,7 +147,8 @@ public GlobalOrdinalsStringTermsAggregator( return new DenseGlobalOrds(); }); } - this.fieldName = ((ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource).getIndexFieldName(); + this.fieldName = (valuesSource instanceof ValuesSource.Bytes.WithOrdinals.FieldData) ? + ((ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource).getIndexFieldName() : null; } String descriptCollectionStrategy() { @@ -170,6 +172,10 @@ LeafBucketCollector termDocFreqCollector( SortedSetDocValues globalOrds, BiConsumer ordCountConsumer ) throws IOException { + if (weight == null) { + // Calculate weight if not assigned previously + this.weight = context.searcher().createWeight(context.query(), ScoreMode.COMPLETE_NO_SCORES, 1f); + } if (weight.count(ctx) != ctx.reader().maxDoc()) { // Top-level query does not match all docs in this segment. 
return null; @@ -221,7 +227,7 @@ public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, LeafBucketCol (ord, docCount) -> incrementBucketDocCount(collectionStrategy.globalOrdToBucketOrd(0, ord), docCount) ); if (termDocFreqCollector != null) { - return null; + return termDocFreqCollector; } } From 7aa424ae6be61ef1420beaf09f75e586215be024 Mon Sep 17 00:00:00 2001 From: Sandesh Kumar Date: Fri, 2 Feb 2024 16:28:33 -0800 Subject: [PATCH 08/30] Add case for weight.count == 0 Signed-off-by: Sandesh Kumar --- .../bucket/terms/GlobalOrdinalsStringTermsAggregator.java | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java index 641fab5a2e5da..671631be0b547 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java @@ -176,6 +176,12 @@ LeafBucketCollector termDocFreqCollector( // Calculate weight if not assigned previously this.weight = context.searcher().createWeight(context.query(), ScoreMode.COMPLETE_NO_SCORES, 1f); } + + if (weight.count(ctx) == 0) { + // No documents matches top level query on this segment, we can skip it + return LeafBucketCollector.NO_OP_COLLECTOR; + } + if (weight.count(ctx) != ctx.reader().maxDoc()) { // Top-level query does not match all docs in this segment. return null; From d52c57848193057ecb942d928ee71519d3123768 Mon Sep 17 00:00:00 2001 From: Sandesh Kumar Date: Fri, 2 Feb 2024 16:32:34 -0800 Subject: [PATCH 09/30] minor refactoring Signed-off-by: Sandesh Kumar --- .../bucket/terms/GlobalOrdinalsStringTermsAggregator.java | 4 +--- .../org/opensearch/search/internal/ContextIndexSearcher.java | 3 --- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java index 671631be0b547..aa52a4b2d763b 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java @@ -92,9 +92,7 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr private final LongPredicate acceptedGlobalOrdinals; private final long valueCount; - - private String fieldName; - + private final String fieldName; private Weight weight; private final GlobalOrdLookupFunction lookupGlobalOrd; protected final CollectionStrategy collectionStrategy; diff --git a/server/src/main/java/org/opensearch/search/internal/ContextIndexSearcher.java b/server/src/main/java/org/opensearch/search/internal/ContextIndexSearcher.java index 275a75c33406c..ec3ed2332d0b8 100644 --- a/server/src/main/java/org/opensearch/search/internal/ContextIndexSearcher.java +++ b/server/src/main/java/org/opensearch/search/internal/ContextIndexSearcher.java @@ -309,9 +309,6 @@ private void searchLeaf(LeafReaderContext ctx, Weight weight, Collector collecto // See please https://github.com/apache/lucene/pull/964 collector.setWeight(weight); leafCollector = collector.getLeafCollector(ctx); - if (leafCollector == null) { - 
return; - } } catch (CollectionTerminatedException e) { // there is no doc of interest in this reader context // continue with the following leaf From f5b7643a68facd3017952c4f0e1b73d18509fba5 Mon Sep 17 00:00:00 2001 From: Sandesh Kumar Date: Fri, 2 Feb 2024 16:58:45 -0800 Subject: [PATCH 10/30] fixing weight null check Signed-off-by: Sandesh Kumar --- .../terms/GlobalOrdinalsStringTermsAggregator.java | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java index aa52a4b2d763b..d81a11c7cf7d3 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java @@ -170,21 +170,16 @@ LeafBucketCollector termDocFreqCollector( SortedSetDocValues globalOrds, BiConsumer ordCountConsumer ) throws IOException { - if (weight == null) { - // Calculate weight if not assigned previously - this.weight = context.searcher().createWeight(context.query(), ScoreMode.COMPLETE_NO_SCORES, 1f); + if (weight == null || weight.count(ctx) != ctx.reader().maxDoc()) { + // Weight not assigned or top-level query does not match all docs in the segment. + return null; } if (weight.count(ctx) == 0) { - // No documents matches top level query on this segment, we can skip it + // No documents matches top level query on this segment, we can skip the segment return LeafBucketCollector.NO_OP_COLLECTOR; } - if (weight.count(ctx) != ctx.reader().maxDoc()) { - // Top-level query does not match all docs in this segment. - return null; - } - Terms segmentTerms = ctx.reader().terms(this.fieldName); if (segmentTerms == null) { // Field is not indexed. From 70571a7586f4151f5acd621ec49ce3cf83747e08 Mon Sep 17 00:00:00 2001 From: Sandesh Kumar Date: Mon, 5 Feb 2024 12:25:42 -0800 Subject: [PATCH 11/30] spotless apply Signed-off-by: Sandesh Kumar --- .../bucket/terms/GlobalOrdinalsStringTermsAggregator.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java index d81a11c7cf7d3..320c684d933db 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java @@ -40,7 +40,6 @@ import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.search.ScoreMode; import org.apache.lucene.search.Weight; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; @@ -145,8 +144,9 @@ public GlobalOrdinalsStringTermsAggregator( return new DenseGlobalOrds(); }); } - this.fieldName = (valuesSource instanceof ValuesSource.Bytes.WithOrdinals.FieldData) ? - ((ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource).getIndexFieldName() : null; + this.fieldName = (valuesSource instanceof ValuesSource.Bytes.WithOrdinals.FieldData) + ? 
((ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource).getIndexFieldName() + : null; } String descriptCollectionStrategy() { From 3e8f96d6f6c824a06fa6a69fcaa783b285f42f05 Mon Sep 17 00:00:00 2001 From: Sandesh Kumar Date: Mon, 5 Feb 2024 14:28:35 -0800 Subject: [PATCH 12/30] Lowcardinality case rectified - was not collecing buckets correctly Signed-off-by: Sandesh Kumar --- .../bucket/terms/GlobalOrdinalsStringTermsAggregator.java | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java index 320c684d933db..014460837c491 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java @@ -428,11 +428,7 @@ public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, LeafBucketCol segmentDocCounts = context.bigArrays().grow(segmentDocCounts, 1 + segmentOrds.getValueCount()); assert sub == LeafBucketCollector.NO_OP_COLLECTOR; - LeafBucketCollector termDocFreqCollector = this.termDocFreqCollector( - ctx, - segmentOrds, - (ord, docCount) -> segmentDocCounts.increment(ord + 1, docCount) - ); + LeafBucketCollector termDocFreqCollector = this.termDocFreqCollector(ctx, segmentOrds, this::incrementBucketDocCount); if (termDocFreqCollector != null) { return termDocFreqCollector; } From 98a33cdf21d490fd5e34f40e2d5ebab36f28836a Mon Sep 17 00:00:00 2001 From: Sandesh Kumar Date: Thu, 8 Feb 2024 13:29:54 -0800 Subject: [PATCH 13/30] Fix test failures for termintate_early cases Signed-off-by: Sandesh Kumar --- .../GlobalOrdinalsStringTermsAggregator.java | 3 +- .../search/query/QueryPhaseTests.java | 28 +++++++++++++++---- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java index 014460837c491..bbf5b05396a48 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java @@ -171,7 +171,8 @@ LeafBucketCollector termDocFreqCollector( BiConsumer ordCountConsumer ) throws IOException { if (weight == null || weight.count(ctx) != ctx.reader().maxDoc()) { - // Weight not assigned or top-level query does not match all docs in the segment. 
+ // weight.count(ctx) == ctx.reader().maxDoc() implies there are no deleted documents and + // top-level query matches all docs in the segment return null; } diff --git a/server/src/test/java/org/opensearch/search/query/QueryPhaseTests.java b/server/src/test/java/org/opensearch/search/query/QueryPhaseTests.java index d0e01c5461c79..4bd4d406e4391 100644 --- a/server/src/test/java/org/opensearch/search/query/QueryPhaseTests.java +++ b/server/src/test/java/org/opensearch/search/query/QueryPhaseTests.java @@ -122,6 +122,7 @@ import java.util.concurrent.TimeUnit; import static org.opensearch.search.query.TopDocsCollectorContext.hasInfMaxScore; +import static org.hamcrest.Matchers.allOf; import static org.hamcrest.Matchers.anyOf; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.greaterThan; @@ -437,10 +438,16 @@ public void testTerminateAfterEarlyTermination() throws Exception { assertThat(context.queryResult().topDocs().topDocs.totalHits.value, equalTo(1L)); assertThat(context.queryResult().topDocs().topDocs.scoreDocs.length, equalTo(1)); + // Do not expect an exact match when terminate_after is used in conjunction to size = 0 as an optimization introduced by + // https://issues.apache.org/jira/browse/LUCENE-10620 can produce a total hit count >= terminated_after, because + // TotalHitCountCollector is used in this case as part of Weight#count() optimization context.setSize(0); QueryPhase.executeInternal(context.withCleanQueryResult(), queryPhaseSearcher); assertTrue(context.queryResult().terminatedEarly()); - assertThat(context.queryResult().topDocs().topDocs.totalHits.value, equalTo(1L)); + assertThat( + context.queryResult().topDocs().topDocs.totalHits.value, + allOf(greaterThanOrEqualTo(1L), lessThanOrEqualTo((long) numDocs)) + ); assertThat(context.queryResult().topDocs().topDocs.scoreDocs.length, equalTo(0)); } @@ -466,7 +473,10 @@ public void testTerminateAfterEarlyTermination() throws Exception { context.parsedQuery(new ParsedQuery(bq)); QueryPhase.executeInternal(context.withCleanQueryResult(), queryPhaseSearcher); assertTrue(context.queryResult().terminatedEarly()); - assertThat(context.queryResult().topDocs().topDocs.totalHits.value, equalTo(1L)); + assertThat( + context.queryResult().topDocs().topDocs.totalHits.value, + allOf(greaterThanOrEqualTo(1L), lessThanOrEqualTo((long) numDocs)) + ); assertThat(context.queryResult().topDocs().topDocs.scoreDocs.length, equalTo(0)); } { @@ -486,9 +496,12 @@ public void testTerminateAfterEarlyTermination() throws Exception { context.queryCollectorManagers().put(TotalHitCountCollector.class, manager); QueryPhase.executeInternal(context.withCleanQueryResult(), queryPhaseSearcher); assertTrue(context.queryResult().terminatedEarly()); - assertThat(context.queryResult().topDocs().topDocs.totalHits.value, equalTo(1L)); + assertThat( + context.queryResult().topDocs().topDocs.totalHits.value, + allOf(greaterThanOrEqualTo(1L), lessThanOrEqualTo((long) numDocs)) + ); assertThat(context.queryResult().topDocs().topDocs.scoreDocs.length, equalTo(0)); - assertThat(manager.getTotalHits(), equalTo(1)); + assertThat(manager.getTotalHits(), allOf(greaterThanOrEqualTo(1), lessThanOrEqualTo(numDocs))); } // tests with trackTotalHits and terminateAfter @@ -503,7 +516,10 @@ public void testTerminateAfterEarlyTermination() throws Exception { if (trackTotalHits == -1) { assertThat(context.queryResult().topDocs().topDocs.totalHits.value, equalTo(0L)); } else { - assertThat(context.queryResult().topDocs().topDocs.totalHits.value, 
equalTo((long) Math.min(trackTotalHits, 10))); + assertThat( + context.queryResult().topDocs().topDocs.totalHits.value, + allOf(greaterThanOrEqualTo(Math.min(trackTotalHits, 10L)), lessThanOrEqualTo((long) numDocs)) + ); } assertThat(context.queryResult().topDocs().topDocs.scoreDocs.length, equalTo(0)); // The concurrent search terminates the collection when the number of hits is reached by each @@ -511,7 +527,7 @@ public void testTerminateAfterEarlyTermination() throws Exception { // slices (as the unit of concurrency). To address that, we have to use the shared global state, // much as HitsThresholdChecker does. if (executor == null) { - assertThat(manager.getTotalHits(), equalTo(10)); + assertThat(manager.getTotalHits(), allOf(greaterThanOrEqualTo(Math.min(trackTotalHits, 10)), lessThanOrEqualTo(numDocs))); } } From 85e9fe285f2bffa590eaae7533de301c0e6629d6 Mon Sep 17 00:00:00 2001 From: Sandesh Kumar Date: Wed, 14 Feb 2024 12:01:26 -0800 Subject: [PATCH 14/30] Optimize only for Standard Term Results Signed-off-by: Sandesh Kumar --- .../terms/GlobalOrdinalsStringTermsAggregator.java | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java index bbf5b05396a48..0544a73a8ea41 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java @@ -220,7 +220,7 @@ public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, LeafBucketCol SortedSetDocValues globalOrds = valuesSource.globalOrdinalsValues(ctx); collectionStrategy.globalOrdsReady(globalOrds); - if (collectionStrategy instanceof DenseGlobalOrds && sub == LeafBucketCollector.NO_OP_COLLECTOR) { + if (collectionStrategy instanceof DenseGlobalOrds && this.resultStrategy instanceof StandardTermsResults && sub == LeafBucketCollector.NO_OP_COLLECTOR) { LeafBucketCollector termDocFreqCollector = termDocFreqCollector( ctx, globalOrds, @@ -429,9 +429,12 @@ public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, LeafBucketCol segmentDocCounts = context.bigArrays().grow(segmentDocCounts, 1 + segmentOrds.getValueCount()); assert sub == LeafBucketCollector.NO_OP_COLLECTOR; - LeafBucketCollector termDocFreqCollector = this.termDocFreqCollector(ctx, segmentOrds, this::incrementBucketDocCount); - if (termDocFreqCollector != null) { - return termDocFreqCollector; + + if (this.resultStrategy instanceof StandardTermsResults) { + LeafBucketCollector termDocFreqCollector = this.termDocFreqCollector(ctx, segmentOrds, this::incrementBucketDocCount); + if (termDocFreqCollector != null) { + return termDocFreqCollector; + } } final SortedDocValues singleValues = DocValues.unwrapSingleton(segmentOrds); From 2673a1fc612c83222232205779040e2db87feb06 Mon Sep 17 00:00:00 2001 From: Sandesh Kumar Date: Wed, 14 Feb 2024 12:05:51 -0800 Subject: [PATCH 15/30] spotless fix Signed-off-by: Sandesh Kumar --- .../bucket/terms/GlobalOrdinalsStringTermsAggregator.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java 
b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java index 0544a73a8ea41..a9a4ab2157826 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java @@ -220,7 +220,9 @@ public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, LeafBucketCol SortedSetDocValues globalOrds = valuesSource.globalOrdinalsValues(ctx); collectionStrategy.globalOrdsReady(globalOrds); - if (collectionStrategy instanceof DenseGlobalOrds && this.resultStrategy instanceof StandardTermsResults && sub == LeafBucketCollector.NO_OP_COLLECTOR) { + if (collectionStrategy instanceof DenseGlobalOrds + && this.resultStrategy instanceof StandardTermsResults + && sub == LeafBucketCollector.NO_OP_COLLECTOR) { LeafBucketCollector termDocFreqCollector = termDocFreqCollector( ctx, globalOrds, @@ -429,7 +431,6 @@ public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, LeafBucketCol segmentDocCounts = context.bigArrays().grow(segmentDocCounts, 1 + segmentOrds.getValueCount()); assert sub == LeafBucketCollector.NO_OP_COLLECTOR; - if (this.resultStrategy instanceof StandardTermsResults) { LeafBucketCollector termDocFreqCollector = this.termDocFreqCollector(ctx, segmentOrds, this::incrementBucketDocCount); if (termDocFreqCollector != null) { From ab3f9f940fec24303fbf5efd5f12621ac234cd0a Mon Sep 17 00:00:00 2001 From: Sandesh Kumar Date: Thu, 15 Feb 2024 13:51:10 -0800 Subject: [PATCH 16/30] fix low cardinality case Signed-off-by: Sandesh Kumar --- .../terms/GlobalOrdinalsStringTermsAggregator.java | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java index a9a4ab2157826..a69e55181fcd2 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java @@ -430,17 +430,23 @@ public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, LeafBucketCol final SortedSetDocValues segmentOrds = valuesSource.ordinalsValues(ctx); segmentDocCounts = context.bigArrays().grow(segmentDocCounts, 1 + segmentOrds.getValueCount()); assert sub == LeafBucketCollector.NO_OP_COLLECTOR; + mapping = valuesSource.globalOrdinalsMapping(ctx); if (this.resultStrategy instanceof StandardTermsResults) { - LeafBucketCollector termDocFreqCollector = this.termDocFreqCollector(ctx, segmentOrds, this::incrementBucketDocCount); + LeafBucketCollector termDocFreqCollector = this.termDocFreqCollector( + ctx, + segmentOrds, + (ord, docCount) -> incrementBucketDocCount( + collectionStrategy.globalOrdToBucketOrd(0, mapping.applyAsLong(ord)), + docCount + ) + ); if (termDocFreqCollector != null) { return termDocFreqCollector; } } final SortedDocValues singleValues = DocValues.unwrapSingleton(segmentOrds); - mapping = valuesSource.globalOrdinalsMapping(ctx); - // Dense mode doesn't support include/exclude so we don't have to check it here. 
if (singleValues != null) { segmentsWithSingleValuedOrds++; return resultStrategy.wrapCollector(new LeafBucketCollectorBase(sub, segmentOrds) { From 52fd9483548ef028b13787f9ae9557dd02b0e9f6 Mon Sep 17 00:00:00 2001 From: Sandesh Kumar Date: Thu, 15 Feb 2024 14:52:05 -0800 Subject: [PATCH 17/30] Remove extra checks Signed-off-by: Sandesh Kumar --- .../GlobalOrdinalsStringTermsAggregator.java | 23 +++++++------------ 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java index a69e55181fcd2..c5bc7caa718a0 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java @@ -220,9 +220,7 @@ public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, LeafBucketCol SortedSetDocValues globalOrds = valuesSource.globalOrdinalsValues(ctx); collectionStrategy.globalOrdsReady(globalOrds); - if (collectionStrategy instanceof DenseGlobalOrds - && this.resultStrategy instanceof StandardTermsResults - && sub == LeafBucketCollector.NO_OP_COLLECTOR) { + if (collectionStrategy instanceof DenseGlobalOrds && sub == LeafBucketCollector.NO_OP_COLLECTOR) { LeafBucketCollector termDocFreqCollector = termDocFreqCollector( ctx, globalOrds, @@ -432,18 +430,13 @@ public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, LeafBucketCol assert sub == LeafBucketCollector.NO_OP_COLLECTOR; mapping = valuesSource.globalOrdinalsMapping(ctx); - if (this.resultStrategy instanceof StandardTermsResults) { - LeafBucketCollector termDocFreqCollector = this.termDocFreqCollector( - ctx, - segmentOrds, - (ord, docCount) -> incrementBucketDocCount( - collectionStrategy.globalOrdToBucketOrd(0, mapping.applyAsLong(ord)), - docCount - ) - ); - if (termDocFreqCollector != null) { - return termDocFreqCollector; - } + LeafBucketCollector termDocFreqCollector = this.termDocFreqCollector( + ctx, + segmentOrds, + (ord, docCount) -> incrementBucketDocCount(collectionStrategy.globalOrdToBucketOrd(0, mapping.applyAsLong(ord)), docCount) + ); + if (termDocFreqCollector != null) { + return termDocFreqCollector; } final SortedDocValues singleValues = DocValues.unwrapSingleton(segmentOrds); From ebf4ce58b3ec6dadc29bb4f3588e80c93a040a2f Mon Sep 17 00:00:00 2001 From: Sandesh Kumar Date: Thu, 15 Feb 2024 15:21:53 -0800 Subject: [PATCH 18/30] Minor optimization Signed-off-by: Sandesh Kumar --- .../bucket/terms/GlobalOrdinalsStringTermsAggregator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java index c5bc7caa718a0..70fbe30b0a558 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java @@ -433,7 +433,7 @@ public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, LeafBucketCol LeafBucketCollector termDocFreqCollector = this.termDocFreqCollector( ctx, segmentOrds, - (ord, docCount) -> 
incrementBucketDocCount(collectionStrategy.globalOrdToBucketOrd(0, mapping.applyAsLong(ord)), docCount) + (ord, docCount) -> incrementBucketDocCount(mapping.applyAsLong(ord), docCount) ); if (termDocFreqCollector != null) { return termDocFreqCollector; From b14c03e3df61894e29f60d0241e5e82c1c339b90 Mon Sep 17 00:00:00 2001 From: Sandesh Kumar Date: Thu, 15 Feb 2024 18:07:09 -0800 Subject: [PATCH 19/30] Revert "Minor optimization" This reverts commit c17f93a216484d03049d89df3cbdd81b39f58e67. Signed-off-by: Sandesh Kumar --- .../bucket/terms/GlobalOrdinalsStringTermsAggregator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java index 70fbe30b0a558..c5bc7caa718a0 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java @@ -433,7 +433,7 @@ public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, LeafBucketCol LeafBucketCollector termDocFreqCollector = this.termDocFreqCollector( ctx, segmentOrds, - (ord, docCount) -> incrementBucketDocCount(mapping.applyAsLong(ord), docCount) + (ord, docCount) -> incrementBucketDocCount(collectionStrategy.globalOrdToBucketOrd(0, mapping.applyAsLong(ord)), docCount) ); if (termDocFreqCollector != null) { return termDocFreqCollector; From 318e4ea1cd79d289ac03b1c257b68cbcc2eccb71 Mon Sep 17 00:00:00 2001 From: Sandesh Kumar Date: Fri, 16 Feb 2024 00:04:42 -0800 Subject: [PATCH 20/30] fix significant terms aggregator condition Signed-off-by: Sandesh Kumar --- .../GlobalOrdinalsStringTermsAggregator.java | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java index c5bc7caa718a0..39aae29a36e5f 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java @@ -220,7 +220,9 @@ public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, LeafBucketCol SortedSetDocValues globalOrds = valuesSource.globalOrdinalsValues(ctx); collectionStrategy.globalOrdsReady(globalOrds); - if (collectionStrategy instanceof DenseGlobalOrds && sub == LeafBucketCollector.NO_OP_COLLECTOR) { + if (collectionStrategy instanceof DenseGlobalOrds + && this.resultStrategy instanceof StandardTermsResults + && sub == LeafBucketCollector.NO_OP_COLLECTOR) { LeafBucketCollector termDocFreqCollector = termDocFreqCollector( ctx, globalOrds, @@ -430,13 +432,15 @@ public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, LeafBucketCol assert sub == LeafBucketCollector.NO_OP_COLLECTOR; mapping = valuesSource.globalOrdinalsMapping(ctx); - LeafBucketCollector termDocFreqCollector = this.termDocFreqCollector( - ctx, - segmentOrds, - (ord, docCount) -> incrementBucketDocCount(collectionStrategy.globalOrdToBucketOrd(0, mapping.applyAsLong(ord)), docCount) - ); - if (termDocFreqCollector != null) { - return termDocFreqCollector; + if (this.resultStrategy 
instanceof StandardTermsResults) { + LeafBucketCollector termDocFreqCollector = this.termDocFreqCollector( + ctx, + segmentOrds, + (ord, docCount) -> incrementBucketDocCount(mapping.applyAsLong(ord), docCount) + ); + if (termDocFreqCollector != null) { + return termDocFreqCollector; + } } final SortedDocValues singleValues = DocValues.unwrapSingleton(segmentOrds); From 1c7b400ec3e1a2e28a154667d901ffb526e46d1b Mon Sep 17 00:00:00 2001 From: Sandesh Kumar Date: Thu, 22 Feb 2024 17:26:54 -0800 Subject: [PATCH 21/30] Add UTs for optimization Signed-off-by: Sandesh Kumar --- .../GlobalOrdinalsStringTermsAggregator.java | 1 + .../bucket/terms/TermsAggregatorTests.java | 137 +++++++++++++----- 2 files changed, 103 insertions(+), 35 deletions(-) diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java index 39aae29a36e5f..bb85344d5c4ee 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java @@ -248,6 +248,7 @@ public void collect(int doc, long owningBucketOrd) throws IOException { return; } int globalOrd = singleValues.ordValue(); + //Hello collectionStrategy.collectGlobalOrd(owningBucketOrd, doc, globalOrd, sub); } }); diff --git a/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/TermsAggregatorTests.java b/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/TermsAggregatorTests.java index 80744ecde4d69..c2d27fbf3600e 100644 --- a/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/TermsAggregatorTests.java +++ b/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/TermsAggregatorTests.java @@ -52,6 +52,7 @@ import org.apache.lucene.tests.index.RandomIndexWriter; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.NumericUtils; +import org.opensearch.common.TriConsumer; import org.opensearch.common.geo.GeoPoint; import org.opensearch.common.network.InetAddresses; import org.opensearch.common.settings.Settings; @@ -143,6 +144,18 @@ public class TermsAggregatorTests extends AggregatorTestCase { private static final String STRING_SCRIPT_NAME = "string_script"; private static final String STRING_SCRIPT_OUTPUT = "Orange"; + private static final Consumer DEFAULT_POST_COLLECTION = termsAggregator -> { + try { + termsAggregator.postCollection(); + } catch (IOException e) { + throw new RuntimeException(e); + } + }; + + // aggregator.postCollection() is not required when LeafBucketCollector#termDocFreqCollector optimization is used. + // using NOOP_POST_COLLECTION_CONSUMER ensures that the bucket count in aggregation is completed before/without running postCollection() + private static final Consumer NOOP_POST_COLLECTION_CONSUMER = termsAggregator -> {}; + @Override protected MapperService mapperServiceMock() { MapperService mapperService = mock(MapperService.class); @@ -257,24 +270,54 @@ public void testUsesGlobalOrdinalsByDefault() throws Exception { directory.close(); } - public void testSimple() throws Exception { + /** + * This test case utilizes the low cardinality implementation of GlobalOrdinalsStringTermsAggregator. 
+ * In this case, the segment terms will not get initialized and will run without LeafBucketCollector#termDocFreqCollector optimization + */ + public void testSimpleAggregation() throws Exception { + testSimple( + (document, field, value) -> document.add(new SortedSetDocValuesField(field, new BytesRef(value))), + DEFAULT_POST_COLLECTION + ); + } + + /** + * This test case utilizes the low cardinality implementation of GlobalOrdinalsStringTermsAggregator. + * In this case, the segment terms will get initialized and will use LeafBucketCollector#termDocFreqCollector optimization + */ + public void testSimpleAggregationWithStoredValues() throws Exception { + // aggregator.postCollection() is not required when LeafBucketCollector#termDocFreqCollector optimization is used. + // This also verifies that the bucket count is completed without running postCollection() + testSimple((document, field, value) -> { + document.add(new SortedSetDocValuesField(field, new BytesRef(value))); + document.add(new StringField(field, value, Field.Store.NO)); + }, NOOP_POST_COLLECTION_CONSUMER); + + } + + /** + * This is a utility method to test out string terms aggregation + * @param addFieldConsumer a function that determines how a field is added to the document + */ + private void testSimple(TriConsumer addFieldConsumer, Consumer postCollectionConsumer) + throws Exception { try (Directory directory = newDirectory()) { try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory)) { Document document = new Document(); - document.add(new SortedSetDocValuesField("string", new BytesRef("a"))); - document.add(new SortedSetDocValuesField("string", new BytesRef("b"))); + addFieldConsumer.apply(document, "string", "a"); + addFieldConsumer.apply(document, "string", "b"); indexWriter.addDocument(document); document = new Document(); - document.add(new SortedSetDocValuesField("string", new BytesRef(""))); - document.add(new SortedSetDocValuesField("string", new BytesRef("c"))); - document.add(new SortedSetDocValuesField("string", new BytesRef("a"))); + addFieldConsumer.apply(document, "string", ""); + addFieldConsumer.apply(document, "string", "c"); + addFieldConsumer.apply(document, "string", "a"); indexWriter.addDocument(document); document = new Document(); - document.add(new SortedSetDocValuesField("string", new BytesRef("b"))); - document.add(new SortedSetDocValuesField("string", new BytesRef("d"))); + addFieldConsumer.apply(document, "string", "b"); + addFieldConsumer.apply(document, "string", "d"); indexWriter.addDocument(document); document = new Document(); - document.add(new SortedSetDocValuesField("string", new BytesRef(""))); + addFieldConsumer.apply(document, "string", ""); indexWriter.addDocument(document); try (IndexReader indexReader = maybeWrapReaderEs(indexWriter.getReader())) { IndexSearcher indexSearcher = newIndexSearcher(indexReader); @@ -287,7 +330,7 @@ public void testSimple() throws Exception { TermsAggregator aggregator = createAggregator(aggregationBuilder, indexSearcher, fieldType); aggregator.preCollection(); indexSearcher.search(new MatchAllDocsQuery(), aggregator); - aggregator.postCollection(); + postCollectionConsumer.accept(aggregator); Terms result = reduce(aggregator); assertEquals(5, result.getBuckets().size()); assertEquals("", result.getBuckets().get(0).getKeyAsString()); @@ -307,38 +350,63 @@ public void testSimple() throws Exception { } } + /** + * This test case utilizes the default implementation of GlobalOrdinalsStringTermsAggregator. 
+ * In this case, the segment terms will not get initialized and will run without LeafBucketCollector#termDocFreqCollector optimization + */ public void testStringIncludeExclude() throws Exception { + testStringIncludeExclude( + (document, field, value) -> document.add(new SortedSetDocValuesField(field, new BytesRef(value))), + DEFAULT_POST_COLLECTION + ); + } + + /** + * This test case utilizes the default implementation of GlobalOrdinalsStringTermsAggregator. + * In this case, the segment terms will get initialized and will use LeafBucketCollector#termDocFreqCollector optimization + */ + public void testStringIncludeExcludeWithStoredValues() throws Exception { + // aggregator.postCollection() is not required when LeafBucketCollector#termDocFreqCollector optimization is used + // This also verifies that the bucket count is completed without running postCollection() + testStringIncludeExclude((document, field, value) -> { + document.add(new SortedSetDocValuesField(field, new BytesRef(value))); + document.add(new StringField(field, value, Field.Store.NO)); + }, NOOP_POST_COLLECTION_CONSUMER); + } + + private void testStringIncludeExclude(TriConsumer addField, Consumer postCollectionConsumer) + throws Exception { try (Directory directory = newDirectory()) { try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory)) { Document document = new Document(); - document.add(new SortedSetDocValuesField("mv_field", new BytesRef("val000"))); - document.add(new SortedSetDocValuesField("mv_field", new BytesRef("val001"))); - document.add(new SortedDocValuesField("sv_field", new BytesRef("val001"))); + addField.apply(document, "mv_field", "val000"); + addField.apply(document, "mv_field", "val001"); + addField.apply(document, "sv_field", "val001"); indexWriter.addDocument(document); document = new Document(); - document.add(new SortedSetDocValuesField("mv_field", new BytesRef("val002"))); - document.add(new SortedSetDocValuesField("mv_field", new BytesRef("val003"))); - document.add(new SortedDocValuesField("sv_field", new BytesRef("val003"))); + addField.apply(document, "mv_field", "val002"); + addField.apply(document, "mv_field", "val003"); + addField.apply(document, "sv_field", "val003"); indexWriter.addDocument(document); document = new Document(); - document.add(new SortedSetDocValuesField("mv_field", new BytesRef("val004"))); - document.add(new SortedSetDocValuesField("mv_field", new BytesRef("val005"))); - document.add(new SortedDocValuesField("sv_field", new BytesRef("val005"))); + addField.apply(document, "mv_field", "val004"); + addField.apply(document, "mv_field", "val005"); + addField.apply(document, "sv_field", "val005"); indexWriter.addDocument(document); document = new Document(); - document.add(new SortedSetDocValuesField("mv_field", new BytesRef("val006"))); - document.add(new SortedSetDocValuesField("mv_field", new BytesRef("val007"))); - document.add(new SortedDocValuesField("sv_field", new BytesRef("val007"))); + addField.apply(document, "mv_field", "val006"); + addField.apply(document, "mv_field", "val007"); + addField.apply(document, "sv_field", "val007"); indexWriter.addDocument(document); document = new Document(); - document.add(new SortedSetDocValuesField("mv_field", new BytesRef("val008"))); - document.add(new SortedSetDocValuesField("mv_field", new BytesRef("val009"))); - document.add(new SortedDocValuesField("sv_field", new BytesRef("val009"))); + addField.apply(document, "mv_field", "val008"); + addField.apply(document, "mv_field", "val009"); + 
addField.apply(document, "sv_field", "val009"); indexWriter.addDocument(document); document = new Document(); - document.add(new SortedSetDocValuesField("mv_field", new BytesRef("val010"))); - document.add(new SortedSetDocValuesField("mv_field", new BytesRef("val011"))); - document.add(new SortedDocValuesField("sv_field", new BytesRef("val011"))); + addField.apply(document, "mv_field", "val010"); + addField.apply(document, "mv_field", "val011"); + addField.apply(document, "sv_field", "val011"); indexWriter.addDocument(document); try (IndexReader indexReader = maybeWrapReaderEs(indexWriter.getReader())) { IndexSearcher indexSearcher = newIndexSearcher(indexReader); @@ -355,7 +423,7 @@ public void testStringIncludeExclude() throws Exception { TermsAggregator aggregator = createAggregator(aggregationBuilder, indexSearcher, fieldType); aggregator.preCollection(); indexSearcher.search(new MatchAllDocsQuery(), aggregator); - aggregator.postCollection(); + postCollectionConsumer.accept(aggregator); Terms result = reduce(aggregator); assertEquals(10, result.getBuckets().size()); assertEquals("val000", result.getBuckets().get(0).getKeyAsString()); @@ -390,7 +458,7 @@ public void testStringIncludeExclude() throws Exception { aggregator = createAggregator(aggregationBuilder, indexSearcher, fieldType2); aggregator.preCollection(); indexSearcher.search(new MatchAllDocsQuery(), aggregator); - aggregator.postCollection(); + postCollectionConsumer.accept(aggregator); result = reduce(aggregator); assertEquals(5, result.getBuckets().size()); assertEquals("val001", result.getBuckets().get(0).getKeyAsString()); @@ -414,7 +482,7 @@ public void testStringIncludeExclude() throws Exception { aggregator = createAggregator(aggregationBuilder, indexSearcher, fieldType); aggregator.preCollection(); indexSearcher.search(new MatchAllDocsQuery(), aggregator); - aggregator.postCollection(); + postCollectionConsumer.accept(aggregator); result = reduce(aggregator); assertEquals(8, result.getBuckets().size()); assertEquals("val002", result.getBuckets().get(0).getKeyAsString()); @@ -443,7 +511,7 @@ public void testStringIncludeExclude() throws Exception { aggregator = createAggregator(aggregationBuilder, indexSearcher, fieldType); aggregator.preCollection(); indexSearcher.search(new MatchAllDocsQuery(), aggregator); - aggregator.postCollection(); + postCollectionConsumer.accept(aggregator); result = reduce(aggregator); assertEquals(2, result.getBuckets().size()); assertEquals("val010", result.getBuckets().get(0).getKeyAsString()); @@ -460,7 +528,7 @@ public void testStringIncludeExclude() throws Exception { aggregator = createAggregator(aggregationBuilder, indexSearcher, fieldType); aggregator.preCollection(); indexSearcher.search(new MatchAllDocsQuery(), aggregator); - aggregator.postCollection(); + postCollectionConsumer.accept(aggregator); result = reduce(aggregator); assertEquals(2, result.getBuckets().size()); assertEquals("val000", result.getBuckets().get(0).getKeyAsString()); @@ -492,7 +560,7 @@ public void testStringIncludeExclude() throws Exception { aggregator = createAggregator(aggregationBuilder, indexSearcher, fieldType); aggregator.preCollection(); indexSearcher.search(new MatchAllDocsQuery(), aggregator); - aggregator.postCollection(); + postCollectionConsumer.accept(aggregator); result = reduce(aggregator); assertEquals(2, result.getBuckets().size()); assertEquals("val000", result.getBuckets().get(0).getKeyAsString()); @@ -1543,5 +1611,4 @@ private T reduce(Aggregator agg) throws IOExcept 
doAssertReducedMultiBucketConsumer(result, reduceBucketConsumer); return result; } - } From f3187ea1c1fe40efa9a2054000aefbca03ef9b43 Mon Sep 17 00:00:00 2001 From: Sandesh Kumar Date: Thu, 22 Feb 2024 17:56:15 -0800 Subject: [PATCH 22/30] invalid comment correction Signed-off-by: Sandesh Kumar --- .../bucket/terms/GlobalOrdinalsStringTermsAggregator.java | 1 - 1 file changed, 1 deletion(-) diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java index bb85344d5c4ee..39aae29a36e5f 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java @@ -248,7 +248,6 @@ public void collect(int doc, long owningBucketOrd) throws IOException { return; } int globalOrd = singleValues.ordValue(); - //Hello collectionStrategy.collectGlobalOrd(owningBucketOrd, doc, globalOrd, sub); } }); From 6b70df55873c1f452678f550296b53cf95874859 Mon Sep 17 00:00:00 2001 From: Sandesh Kumar Date: Thu, 22 Feb 2024 22:21:28 -0800 Subject: [PATCH 23/30] refactor Signed-off-by: Sandesh Kumar --- .../search/aggregations/bucket/terms/TermsAggregatorTests.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/TermsAggregatorTests.java b/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/TermsAggregatorTests.java index c2d27fbf3600e..51dc53bba4f02 100644 --- a/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/TermsAggregatorTests.java +++ b/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/TermsAggregatorTests.java @@ -154,7 +154,7 @@ public class TermsAggregatorTests extends AggregatorTestCase { // aggregator.postCollection() is not required when LeafBucketCollector#termDocFreqCollector optimization is used. 
// using NOOP_POST_COLLECTION_CONSUMER ensures that the bucket count in aggregation is completed before/without running postCollection() - private static final Consumer NOOP_POST_COLLECTION_CONSUMER = termsAggregator -> {}; + private static final Consumer NOOP_POST_COLLECTION = termsAggregator -> {}; @Override protected MapperService mapperServiceMock() { From ca820f05bdc479a8f3a5f35f929b5e1e789d720b Mon Sep 17 00:00:00 2001 From: Sandesh Kumar Date: Thu, 22 Feb 2024 22:36:02 -0800 Subject: [PATCH 24/30] typo fix Signed-off-by: Sandesh Kumar --- .../aggregations/bucket/terms/TermsAggregatorTests.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/TermsAggregatorTests.java b/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/TermsAggregatorTests.java index 51dc53bba4f02..91fc3b54faa3f 100644 --- a/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/TermsAggregatorTests.java +++ b/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/TermsAggregatorTests.java @@ -291,7 +291,7 @@ public void testSimpleAggregationWithStoredValues() throws Exception { testSimple((document, field, value) -> { document.add(new SortedSetDocValuesField(field, new BytesRef(value))); document.add(new StringField(field, value, Field.Store.NO)); - }, NOOP_POST_COLLECTION_CONSUMER); + }, NOOP_POST_COLLECTION); } @@ -371,7 +371,7 @@ public void testStringIncludeExcludeWithStoredValues() throws Exception { testStringIncludeExclude((document, field, value) -> { document.add(new SortedSetDocValuesField(field, new BytesRef(value))); document.add(new StringField(field, value, Field.Store.NO)); - }, NOOP_POST_COLLECTION_CONSUMER); + }, NOOP_POST_COLLECTION); } private void testStringIncludeExclude(TriConsumer addField, Consumer postCollectionConsumer) From 9a82b348ea9ae2b47bff8d9dc3aa1e8455722dfe Mon Sep 17 00:00:00 2001 From: Sandesh Kumar Date: Mon, 26 Feb 2024 15:44:18 -0800 Subject: [PATCH 25/30] Increase code coverage and minor refactoring Signed-off-by: Sandesh Kumar --- .../GlobalOrdinalsStringTermsAggregator.java | 23 +++-- .../terms/KeywordTermsAggregatorTests.java | 95 ++++++++++++++----- .../bucket/terms/TermsAggregatorTests.java | 30 ++---- .../aggregations/AggregatorTestCase.java | 50 +++++++++- 4 files changed, 137 insertions(+), 61 deletions(-) diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java index 39aae29a36e5f..2a83710117e74 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java @@ -158,7 +158,7 @@ public void setWeight(Weight weight) { } /** - Collects term frequencies for a given field from a LeafReaderContext. 
+ Collects term frequencies for a given field from a LeafReaderContext directly from stored segment terms @param ctx The LeafReaderContext to collect terms from @param globalOrds The SortedSetDocValues for the field's ordinals @param ordCountConsumer A consumer to accept collected term frequencies @@ -170,15 +170,18 @@ LeafBucketCollector termDocFreqCollector( SortedSetDocValues globalOrds, BiConsumer ordCountConsumer ) throws IOException { - if (weight == null || weight.count(ctx) != ctx.reader().maxDoc()) { - // weight.count(ctx) == ctx.reader().maxDoc() implies there are no deleted documents and - // top-level query matches all docs in the segment + if (weight == null) { + // Weight not assigned - cannot use this optimization return null; - } - - if (weight.count(ctx) == 0) { - // No documents matches top level query on this segment, we can skip the segment - return LeafBucketCollector.NO_OP_COLLECTOR; + } else { + if (weight.count(ctx) == 0) { + // No documents matches top level query on this segment, we can skip the segment entirely + return LeafBucketCollector.NO_OP_COLLECTOR; + } else if (weight.count(ctx) != ctx.reader().maxDoc()) { + // weight.count(ctx) == ctx.reader().maxDoc() implies there are no deleted documents and + // top-level query matches all docs in the segment + return null; + } } Terms segmentTerms = ctx.reader().terms(this.fieldName); @@ -198,6 +201,8 @@ LeafBucketCollector termDocFreqCollector( TermsEnum globalOrdinalTermsEnum = globalOrds.termsEnum(); BytesRef ordinalTerm = globalOrdinalTermsEnum.next(); + // Iterate over the terms in the segment, look for matches in the global ordinal terms, + // and increment bucket count when segment terms match global ordinal terms. while (indexTerm != null && ordinalTerm != null) { int compare = indexTerm.compareTo(ordinalTerm); if (compare == 0) { diff --git a/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/KeywordTermsAggregatorTests.java b/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/KeywordTermsAggregatorTests.java index 4229361aa7f46..5d1e02116f189 100644 --- a/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/KeywordTermsAggregatorTests.java +++ b/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/KeywordTermsAggregatorTests.java @@ -42,8 +42,10 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.tests.index.RandomIndexWriter; import org.apache.lucene.util.BytesRef; +import org.opensearch.common.TriConsumer; import org.opensearch.index.mapper.KeywordFieldMapper; import org.opensearch.index.mapper.MappedFieldType; +import org.opensearch.search.aggregations.Aggregator; import org.opensearch.search.aggregations.AggregatorTestCase; import org.opensearch.search.aggregations.support.ValueType; @@ -68,61 +70,103 @@ public class KeywordTermsAggregatorTests extends AggregatorTestCase { dataset = d; } + private static Consumer VERIFY_MATCH_ALL_DOCS = agg -> { + assertEquals(9, agg.getBuckets().size()); + for (int i = 0; i < 9; i++) { + StringTerms.Bucket bucket = (StringTerms.Bucket) agg.getBuckets().get(i); + assertThat(bucket.getKey(), equalTo(String.valueOf(9L - i))); + assertThat(bucket.getDocCount(), equalTo(9L - i)); + } + }; + + private static Query MATCH_ALL_DOCS_QUERY = new MatchAllDocsQuery(); + + private static Query MATCH_NO_DOCS_QUERY = new MatchNoDocsQuery(); + public void testMatchNoDocs() throws IOException { testSearchCase( - new MatchNoDocsQuery(), + ADD_SORTED_FIELD_NO_STORE, + MATCH_NO_DOCS_QUERY, dataset, 
aggregation -> aggregation.field(KEYWORD_FIELD), agg -> assertEquals(0, agg.getBuckets().size()), - null // without type hint + null, // without type hint + DEFAULT_POST_COLLECTION ); testSearchCase( - new MatchNoDocsQuery(), + ADD_SORTED_FIELD_NO_STORE, + MATCH_NO_DOCS_QUERY, dataset, aggregation -> aggregation.field(KEYWORD_FIELD), agg -> assertEquals(0, agg.getBuckets().size()), - ValueType.STRING // with type hint + ValueType.STRING, // with type hint + DEFAULT_POST_COLLECTION ); } public void testMatchAllDocs() throws IOException { - Query query = new MatchAllDocsQuery(); - - testSearchCase(query, dataset, aggregation -> aggregation.field(KEYWORD_FIELD), agg -> { - assertEquals(9, agg.getBuckets().size()); - for (int i = 0; i < 9; i++) { - StringTerms.Bucket bucket = (StringTerms.Bucket) agg.getBuckets().get(i); - assertThat(bucket.getKey(), equalTo(String.valueOf(9L - i))); - assertThat(bucket.getDocCount(), equalTo(9L - i)); - } - }, - null // without type hint + testSearchCase( + ADD_SORTED_FIELD_NO_STORE, + MATCH_ALL_DOCS_QUERY, + dataset, + aggregation -> aggregation.field(KEYWORD_FIELD), + VERIFY_MATCH_ALL_DOCS, + null, // without type hint + DEFAULT_POST_COLLECTION ); - testSearchCase(query, dataset, aggregation -> aggregation.field(KEYWORD_FIELD), agg -> { - assertEquals(9, agg.getBuckets().size()); - for (int i = 0; i < 9; i++) { - StringTerms.Bucket bucket = (StringTerms.Bucket) agg.getBuckets().get(i); - assertThat(bucket.getKey(), equalTo(String.valueOf(9L - i))); - assertThat(bucket.getDocCount(), equalTo(9L - i)); - } - }, - ValueType.STRING // with type hint + testSearchCase( + ADD_SORTED_FIELD_NO_STORE, + MATCH_ALL_DOCS_QUERY, + dataset, + aggregation -> aggregation.field(KEYWORD_FIELD), + VERIFY_MATCH_ALL_DOCS, + ValueType.STRING, // with type hint + DEFAULT_POST_COLLECTION + ); + } + + public void testMatchAllDocsWithStoredValues() throws IOException { + // aggregator.postCollection() is not required when LeafBucketCollector#termDocFreqCollector optimization is used, + // therefore using NOOP_POST_COLLECTION + // This also verifies that the bucket count is completed without running postCollection() + + testSearchCase( + ADD_SORTED_FIELD_STORE, + MATCH_ALL_DOCS_QUERY, + dataset, + aggregation -> aggregation.field(KEYWORD_FIELD), + VERIFY_MATCH_ALL_DOCS, + null, // without type hint + NOOP_POST_COLLECTION + ); + + testSearchCase( + ADD_SORTED_FIELD_STORE, + MATCH_ALL_DOCS_QUERY, + dataset, + aggregation -> aggregation.field(KEYWORD_FIELD), + VERIFY_MATCH_ALL_DOCS, + ValueType.STRING, // with type hint + NOOP_POST_COLLECTION ); } private void testSearchCase( + TriConsumer addField, Query query, List dataset, Consumer configure, Consumer verify, - ValueType valueType + ValueType valueType, + Consumer postCollectionConsumer ) throws IOException { try (Directory directory = newDirectory()) { try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory)) { Document document = new Document(); for (String value : dataset) { + addField.apply(document, KEYWORD_FIELD, value); document.add(new SortedSetDocValuesField(KEYWORD_FIELD, new BytesRef(value))); indexWriter.addDocument(document); document.clear(); @@ -147,5 +191,4 @@ private void testSearchCase( } } } - } diff --git a/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/TermsAggregatorTests.java b/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/TermsAggregatorTests.java index 91fc3b54faa3f..93939657b6981 100644 --- 
a/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/TermsAggregatorTests.java +++ b/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/TermsAggregatorTests.java @@ -144,18 +144,6 @@ public class TermsAggregatorTests extends AggregatorTestCase { private static final String STRING_SCRIPT_NAME = "string_script"; private static final String STRING_SCRIPT_OUTPUT = "Orange"; - private static final Consumer DEFAULT_POST_COLLECTION = termsAggregator -> { - try { - termsAggregator.postCollection(); - } catch (IOException e) { - throw new RuntimeException(e); - } - }; - - // aggregator.postCollection() is not required when LeafBucketCollector#termDocFreqCollector optimization is used. - // using NOOP_POST_COLLECTION_CONSUMER ensures that the bucket count in aggregation is completed before/without running postCollection() - private static final Consumer NOOP_POST_COLLECTION = termsAggregator -> {}; - @Override protected MapperService mapperServiceMock() { MapperService mapperService = mock(MapperService.class); @@ -275,10 +263,7 @@ public void testUsesGlobalOrdinalsByDefault() throws Exception { * In this case, the segment terms will not get initialized and will run without LeafBucketCollector#termDocFreqCollector optimization */ public void testSimpleAggregation() throws Exception { - testSimple( - (document, field, value) -> document.add(new SortedSetDocValuesField(field, new BytesRef(value))), - DEFAULT_POST_COLLECTION - ); + testSimple(ADD_SORTED_FIELD_NO_STORE, DEFAULT_POST_COLLECTION); } /** @@ -286,20 +271,17 @@ public void testSimpleAggregation() throws Exception { * In this case, the segment terms will get initialized and will use LeafBucketCollector#termDocFreqCollector optimization */ public void testSimpleAggregationWithStoredValues() throws Exception { - // aggregator.postCollection() is not required when LeafBucketCollector#termDocFreqCollector optimization is used. 
+ // aggregator.postCollection() is not required when LeafBucketCollector#termDocFreqCollector optimization is used, + // therefore using NOOP_POST_COLLECTION // This also verifies that the bucket count is completed without running postCollection() - testSimple((document, field, value) -> { - document.add(new SortedSetDocValuesField(field, new BytesRef(value))); - document.add(new StringField(field, value, Field.Store.NO)); - }, NOOP_POST_COLLECTION); - + testSimple(ADD_SORTED_FIELD_STORE, NOOP_POST_COLLECTION); } /** * This is a utility method to test out string terms aggregation * @param addFieldConsumer a function that determines how a field is added to the document */ - private void testSimple(TriConsumer addFieldConsumer, Consumer postCollectionConsumer) + private void testSimple(TriConsumer addFieldConsumer, Consumer postCollectionConsumer) throws Exception { try (Directory directory = newDirectory()) { try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory)) { @@ -374,7 +356,7 @@ public void testStringIncludeExcludeWithStoredValues() throws Exception { }, NOOP_POST_COLLECTION); } - private void testStringIncludeExclude(TriConsumer addField, Consumer postCollectionConsumer) + private void testStringIncludeExclude(TriConsumer addField, Consumer postCollectionConsumer) throws Exception { try (Directory directory = newDirectory()) { try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory)) { diff --git a/test/framework/src/main/java/org/opensearch/search/aggregations/AggregatorTestCase.java b/test/framework/src/main/java/org/opensearch/search/aggregations/AggregatorTestCase.java index ac0447dbebf7e..ec928e1122bb3 100644 --- a/test/framework/src/main/java/org/opensearch/search/aggregations/AggregatorTestCase.java +++ b/test/framework/src/main/java/org/opensearch/search/aggregations/AggregatorTestCase.java @@ -34,11 +34,13 @@ import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.BinaryDocValuesField; import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; import org.apache.lucene.document.InetAddressPoint; import org.apache.lucene.document.LatLonDocValuesField; import org.apache.lucene.document.SortedNumericDocValuesField; import org.apache.lucene.document.SortedSetDocValuesField; import org.apache.lucene.document.StoredField; +import org.apache.lucene.document.StringField; import org.apache.lucene.index.CompositeReaderContext; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; @@ -62,6 +64,7 @@ import org.opensearch.Version; import org.opensearch.cluster.metadata.IndexMetadata; import org.opensearch.common.CheckedConsumer; +import org.opensearch.common.TriConsumer; import org.opensearch.common.TriFunction; import org.opensearch.common.lease.Releasable; import org.opensearch.common.lease.Releasables; @@ -121,6 +124,7 @@ import org.opensearch.search.aggregations.AggregatorFactories.Builder; import org.opensearch.search.aggregations.MultiBucketConsumerService.MultiBucketConsumer; import org.opensearch.search.aggregations.bucket.nested.NestedAggregationBuilder; +import org.opensearch.search.aggregations.bucket.terms.TermsAggregator; import org.opensearch.search.aggregations.metrics.MetricsAggregator; import org.opensearch.search.aggregations.pipeline.PipelineAggregator; import org.opensearch.search.aggregations.pipeline.PipelineAggregator.PipelineTree; @@ -178,6 +182,26 @@ public abstract class AggregatorTestCase extends 
OpenSearchTestCase { // A list of field types that should not be tested, or are not currently supported private static List TYPE_TEST_DENYLIST; + protected static final Consumer DEFAULT_POST_COLLECTION = termsAggregator -> { + try { + termsAggregator.postCollection(); + } catch (IOException e) { + throw new RuntimeException(e); + } + }; + + // aggregator.postCollection() is not required when LeafBucketCollector#termDocFreqCollector optimization is used. + // using NOOP_POST_COLLECTION_CONSUMER ensures that the bucket count in aggregation is completed before/without running postCollection() + protected static final Consumer NOOP_POST_COLLECTION = termsAggregator -> {}; + + protected static final TriConsumer ADD_SORTED_FIELD_NO_STORE = (document, field, value) -> + document.add(new SortedSetDocValuesField(field, new BytesRef(value))); + + protected static final TriConsumer ADD_SORTED_FIELD_STORE = (document, field, value) -> { + document.add(new SortedSetDocValuesField(field, new BytesRef(value))); + document.add(new StringField(field, value, Field.Store.NO)); + }; + static { List denylist = new ArrayList<>(); denylist.add(ObjectMapper.CONTENT_TYPE); // Cannot aggregate objects @@ -484,6 +508,16 @@ protected A searchAndReduc return searchAndReduce(createIndexSettings(), searcher, query, builder, DEFAULT_MAX_BUCKETS, fieldTypes); } + protected A searchAndReduce( + IndexSearcher searcher, + Query query, + AggregationBuilder builder, + Consumer postCollectionConsumer, + MappedFieldType... fieldTypes + ) throws IOException { + return searchAndReduce(createIndexSettings(), searcher, query, builder, DEFAULT_MAX_BUCKETS, postCollectionConsumer, fieldTypes); + } + protected A searchAndReduce( IndexSettings indexSettings, IndexSearcher searcher, @@ -504,6 +538,17 @@ protected A searchAndReduc return searchAndReduce(createIndexSettings(), searcher, query, builder, maxBucket, fieldTypes); } + protected A searchAndReduce( + IndexSettings indexSettings, + IndexSearcher searcher, + Query query, + AggregationBuilder builder, + int maxBucket, + MappedFieldType... fieldTypes + ) throws IOException { + return searchAndReduce(indexSettings, searcher, query, builder, maxBucket, DEFAULT_POST_COLLECTION, fieldTypes); + } + /** * Collects all documents that match the provided query {@link Query} and * returns the reduced {@link InternalAggregation}. @@ -518,6 +563,7 @@ protected A searchAndReduc Query query, AggregationBuilder builder, int maxBucket, + Consumer postCollectionConsumer, MappedFieldType... 
fieldTypes ) throws IOException { final IndexReaderContext ctx = searcher.getTopReaderContext(); @@ -548,13 +594,13 @@ protected A searchAndReduc a.preCollection(); Weight weight = subSearcher.createWeight(rewritten, ScoreMode.COMPLETE, 1f); subSearcher.search(weight, a); - a.postCollection(); + postCollectionConsumer.accept(a); aggs.add(a.buildTopLevel()); } } else { root.preCollection(); searcher.search(rewritten, root); - root.postCollection(); + postCollectionConsumer.accept(root); aggs.add(root.buildTopLevel()); } From 3c98cb2ebdf2589e9ad9d0dc5a31b52bf7a21bfc Mon Sep 17 00:00:00 2001 From: Sandesh Kumar Date: Mon, 26 Feb 2024 16:05:42 -0800 Subject: [PATCH 26/30] Spotless in framework Signed-off-by: Sandesh Kumar --- .../opensearch/search/aggregations/AggregatorTestCase.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/framework/src/main/java/org/opensearch/search/aggregations/AggregatorTestCase.java b/test/framework/src/main/java/org/opensearch/search/aggregations/AggregatorTestCase.java index ec928e1122bb3..9fdae80bd1ada 100644 --- a/test/framework/src/main/java/org/opensearch/search/aggregations/AggregatorTestCase.java +++ b/test/framework/src/main/java/org/opensearch/search/aggregations/AggregatorTestCase.java @@ -124,7 +124,6 @@ import org.opensearch.search.aggregations.AggregatorFactories.Builder; import org.opensearch.search.aggregations.MultiBucketConsumerService.MultiBucketConsumer; import org.opensearch.search.aggregations.bucket.nested.NestedAggregationBuilder; -import org.opensearch.search.aggregations.bucket.terms.TermsAggregator; import org.opensearch.search.aggregations.metrics.MetricsAggregator; import org.opensearch.search.aggregations.pipeline.PipelineAggregator; import org.opensearch.search.aggregations.pipeline.PipelineAggregator.PipelineTree; @@ -194,8 +193,9 @@ public abstract class AggregatorTestCase extends OpenSearchTestCase { // using NOOP_POST_COLLECTION_CONSUMER ensures that the bucket count in aggregation is completed before/without running postCollection() protected static final Consumer NOOP_POST_COLLECTION = termsAggregator -> {}; - protected static final TriConsumer ADD_SORTED_FIELD_NO_STORE = (document, field, value) -> - document.add(new SortedSetDocValuesField(field, new BytesRef(value))); + protected static final TriConsumer ADD_SORTED_FIELD_NO_STORE = (document, field, value) -> document.add( + new SortedSetDocValuesField(field, new BytesRef(value)) + ); protected static final TriConsumer ADD_SORTED_FIELD_STORE = (document, field, value) -> { document.add(new SortedSetDocValuesField(field, new BytesRef(value))); From 9be19a02f980dd0583e0be250f84e03696d1570d Mon Sep 17 00:00:00 2001 From: Sandesh Kumar Date: Fri, 8 Mar 2024 15:48:01 -0800 Subject: [PATCH 27/30] Test cases improvement Signed-off-by: Sandesh Kumar --- .../GlobalOrdinalsStringTermsAggregator.java | 10 +- .../terms/KeywordTermsAggregatorTests.java | 75 ++---- .../bucket/terms/TermsAggregatorTests.java | 213 ++++++++++-------- .../aggregations/AggregatorTestCase.java | 131 +++++++---- 4 files changed, 243 insertions(+), 186 deletions(-) diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java index 2a83710117e74..15e538f01e632 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java +++ 
b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java @@ -40,6 +40,7 @@ import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.CollectionTerminatedException; import org.apache.lucene.search.Weight; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; @@ -162,7 +163,7 @@ public void setWeight(Weight weight) { @param ctx The LeafReaderContext to collect terms from @param globalOrds The SortedSetDocValues for the field's ordinals @param ordCountConsumer A consumer to accept collected term frequencies - @return A no-operation LeafBucketCollector implementation, since collection is complete + @return A LeafBucketCollector implementation with collection termination, since collection is complete @throws IOException If an I/O error occurs during reading */ LeafBucketCollector termDocFreqCollector( @@ -217,7 +218,12 @@ LeafBucketCollector termDocFreqCollector( ordinalTerm = globalOrdinalTermsEnum.next(); } } - return LeafBucketCollector.NO_OP_COLLECTOR; + return new LeafBucketCollector() { + @Override + public void collect(int doc, long owningBucketOrd) throws IOException { + throw new CollectionTerminatedException(); + } + }; } @Override diff --git a/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/KeywordTermsAggregatorTests.java b/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/KeywordTermsAggregatorTests.java index 5d1e02116f189..753644dce81d5 100644 --- a/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/KeywordTermsAggregatorTests.java +++ b/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/KeywordTermsAggregatorTests.java @@ -32,7 +32,6 @@ package org.opensearch.search.aggregations.bucket.terms; import org.apache.lucene.document.Document; -import org.apache.lucene.document.SortedSetDocValuesField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.IndexSearcher; @@ -41,11 +40,9 @@ import org.apache.lucene.search.Query; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.index.RandomIndexWriter; -import org.apache.lucene.util.BytesRef; import org.opensearch.common.TriConsumer; import org.opensearch.index.mapper.KeywordFieldMapper; import org.opensearch.index.mapper.MappedFieldType; -import org.opensearch.search.aggregations.Aggregator; import org.opensearch.search.aggregations.AggregatorTestCase; import org.opensearch.search.aggregations.support.ValueType; @@ -59,6 +56,8 @@ public class KeywordTermsAggregatorTests extends AggregatorTestCase { private static final String KEYWORD_FIELD = "keyword"; + private static final Consumer CONFIGURE_KEYWORD_FIELD = agg -> agg.field(KEYWORD_FIELD); + private static final List dataset; static { List d = new ArrayList<>(45); @@ -70,7 +69,7 @@ public class KeywordTermsAggregatorTests extends AggregatorTestCase { dataset = d; } - private static Consumer VERIFY_MATCH_ALL_DOCS = agg -> { + private static final Consumer VERIFY_MATCH_ALL_DOCS = agg -> { assertEquals(9, agg.getBuckets().size()); for (int i = 0; i < 9; i++) { StringTerms.Bucket bucket = (StringTerms.Bucket) agg.getBuckets().get(i); @@ -79,77 +78,49 @@ public class KeywordTermsAggregatorTests extends AggregatorTestCase { } }; - private static Query MATCH_ALL_DOCS_QUERY = new MatchAllDocsQuery(); + private static final Consumer 
VERIFY_MATCH_NO_DOCS = agg -> { assertEquals(0, agg.getBuckets().size()); }; + + private static final Query MATCH_ALL_DOCS_QUERY = new MatchAllDocsQuery(); - private static Query MATCH_NO_DOCS_QUERY = new MatchNoDocsQuery(); + private static final Query MATCH_NO_DOCS_QUERY = new MatchNoDocsQuery(); public void testMatchNoDocs() throws IOException { testSearchCase( - ADD_SORTED_FIELD_NO_STORE, + ADD_SORTED_SET_FIELD_NOT_INDEXED, MATCH_NO_DOCS_QUERY, dataset, - aggregation -> aggregation.field(KEYWORD_FIELD), - agg -> assertEquals(0, agg.getBuckets().size()), - null, // without type hint - DEFAULT_POST_COLLECTION + CONFIGURE_KEYWORD_FIELD, + VERIFY_MATCH_NO_DOCS, + null // without type hint ); testSearchCase( - ADD_SORTED_FIELD_NO_STORE, + ADD_SORTED_SET_FIELD_NOT_INDEXED, MATCH_NO_DOCS_QUERY, dataset, - aggregation -> aggregation.field(KEYWORD_FIELD), - agg -> assertEquals(0, agg.getBuckets().size()), - ValueType.STRING, // with type hint - DEFAULT_POST_COLLECTION + CONFIGURE_KEYWORD_FIELD, + VERIFY_MATCH_NO_DOCS, + ValueType.STRING // with type hint ); } public void testMatchAllDocs() throws IOException { testSearchCase( - ADD_SORTED_FIELD_NO_STORE, - MATCH_ALL_DOCS_QUERY, - dataset, - aggregation -> aggregation.field(KEYWORD_FIELD), - VERIFY_MATCH_ALL_DOCS, - null, // without type hint - DEFAULT_POST_COLLECTION - ); - - testSearchCase( - ADD_SORTED_FIELD_NO_STORE, - MATCH_ALL_DOCS_QUERY, - dataset, - aggregation -> aggregation.field(KEYWORD_FIELD), - VERIFY_MATCH_ALL_DOCS, - ValueType.STRING, // with type hint - DEFAULT_POST_COLLECTION - ); - } - - public void testMatchAllDocsWithStoredValues() throws IOException { - // aggregator.postCollection() is not required when LeafBucketCollector#termDocFreqCollector optimization is used, - // therefore using NOOP_POST_COLLECTION - // This also verifies that the bucket count is completed without running postCollection() - - testSearchCase( - ADD_SORTED_FIELD_STORE, + ADD_SORTED_SET_FIELD_NOT_INDEXED, MATCH_ALL_DOCS_QUERY, dataset, - aggregation -> aggregation.field(KEYWORD_FIELD), + CONFIGURE_KEYWORD_FIELD, VERIFY_MATCH_ALL_DOCS, - null, // without type hint - NOOP_POST_COLLECTION + null // without type hint ); testSearchCase( - ADD_SORTED_FIELD_STORE, + ADD_SORTED_SET_FIELD_NOT_INDEXED, MATCH_ALL_DOCS_QUERY, dataset, - aggregation -> aggregation.field(KEYWORD_FIELD), + CONFIGURE_KEYWORD_FIELD, VERIFY_MATCH_ALL_DOCS, - ValueType.STRING, // with type hint - NOOP_POST_COLLECTION + ValueType.STRING // with type hint ); } @@ -159,15 +130,13 @@ private void testSearchCase( List dataset, Consumer configure, Consumer verify, - ValueType valueType, - Consumer postCollectionConsumer + ValueType valueType ) throws IOException { try (Directory directory = newDirectory()) { try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory)) { Document document = new Document(); for (String value : dataset) { addField.apply(document, KEYWORD_FIELD, value); - document.add(new SortedSetDocValuesField(KEYWORD_FIELD, new BytesRef(value))); indexWriter.addDocument(document); document.clear(); } diff --git a/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/TermsAggregatorTests.java b/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/TermsAggregatorTests.java index 93939657b6981..b8a08068f76a3 100644 --- a/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/TermsAggregatorTests.java +++ b/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/TermsAggregatorTests.java @@ -44,6 +44,7 @@ 
import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexableField; +import org.apache.lucene.index.Term; import org.apache.lucene.search.DocValuesFieldExistsQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MatchAllDocsQuery; @@ -121,6 +122,7 @@ import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.concurrent.atomic.AtomicInteger; import java.util.function.BiFunction; import java.util.function.Consumer; import java.util.function.Function; @@ -137,9 +139,6 @@ import static org.mockito.Mockito.when; public class TermsAggregatorTests extends AggregatorTestCase { - - private boolean randomizeAggregatorImpl = true; - // Constants for a script that returns a string private static final String STRING_SCRIPT_NAME = "string_script"; private static final String STRING_SCRIPT_OUTPUT = "Orange"; @@ -172,9 +171,22 @@ protected ScriptService getMockScriptService() { return new ScriptService(Settings.EMPTY, engines, ScriptModule.CORE_CONTEXTS); } + protected CountingAggregator createCountingAggregator( + AggregationBuilder aggregationBuilder, + IndexSearcher indexSearcher, + boolean randomizeAggregatorImpl, + MappedFieldType... fieldTypes + ) throws IOException { + return new CountingAggregator( + new AtomicInteger(), + createAggregator(aggregationBuilder, indexSearcher, randomizeAggregatorImpl, fieldTypes) + ); + } + protected A createAggregator( AggregationBuilder aggregationBuilder, IndexSearcher indexSearcher, + boolean randomizeAggregatorImpl, MappedFieldType... fieldTypes ) throws IOException { try { @@ -189,6 +201,14 @@ protected A createAggregator( } } + protected A createAggregator( + AggregationBuilder aggregationBuilder, + IndexSearcher indexSearcher, + MappedFieldType... fieldTypes + ) throws IOException { + return createAggregator(aggregationBuilder, indexSearcher, true, fieldTypes); + } + @Override protected AggregationBuilder createAggBuilderForTypeTest(MappedFieldType fieldType, String fieldName) { return new TermsAggregationBuilder("foo").field(fieldName); @@ -208,8 +228,6 @@ protected List getSupportedValuesSourceTypes() { } public void testUsesGlobalOrdinalsByDefault() throws Exception { - randomizeAggregatorImpl = false; - Directory directory = newDirectory(); RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory); indexWriter.close(); @@ -221,7 +239,7 @@ public void testUsesGlobalOrdinalsByDefault() throws Exception { .field("string"); MappedFieldType fieldType = new KeywordFieldMapper.KeywordFieldType("string"); - TermsAggregator aggregator = createAggregator(aggregationBuilder, indexSearcher, fieldType); + TermsAggregator aggregator = createAggregator(aggregationBuilder, indexSearcher, false, fieldType); assertThat(aggregator, instanceOf(GlobalOrdinalsStringTermsAggregator.class)); GlobalOrdinalsStringTermsAggregator globalAgg = (GlobalOrdinalsStringTermsAggregator) aggregator; assertThat(globalAgg.descriptCollectionStrategy(), equalTo("dense")); @@ -259,30 +277,55 @@ public void testUsesGlobalOrdinalsByDefault() throws Exception { } /** - * This test case utilizes the low cardinality implementation of GlobalOrdinalsStringTermsAggregator. - * In this case, the segment terms will not get initialized and will run without LeafBucketCollector#termDocFreqCollector optimization + * This test case utilizes the default implementation of GlobalOrdinalsStringTermsAggregator. 
*/ public void testSimpleAggregation() throws Exception { - testSimple(ADD_SORTED_FIELD_NO_STORE, DEFAULT_POST_COLLECTION); + // Fields not indexed: cannot use LeafBucketCollector#termDocFreqCollector - all documents are visited + testSimple(ADD_SORTED_SET_FIELD_NOT_INDEXED, false, false, TermsAggregatorFactory.ExecutionMode.GLOBAL_ORDINALS, 4); + + // Fields indexed, deleted documents in segment: cannot use LeafBucketCollector#termDocFreqCollector - all documents are visited + testSimple(ADD_SORTED_SET_FIELD_INDEXED, true, false, TermsAggregatorFactory.ExecutionMode.GLOBAL_ORDINALS, 4); + + // Fields indexed, no deleted documents in segment: will use LeafBucketCollector#termDocFreqCollector - no documents are visited + testSimple(ADD_SORTED_SET_FIELD_INDEXED, false, false, TermsAggregatorFactory.ExecutionMode.GLOBAL_ORDINALS, 0); + } + + /** + * This test case utilizes the LowCardinality implementation of GlobalOrdinalsStringTermsAggregator. + */ + public void testSimpleAggregationLowCardinality() throws Exception { + // Fields not indexed: cannot use LeafBucketCollector#termDocFreqCollector - all documents are visited + testSimple(ADD_SORTED_SET_FIELD_NOT_INDEXED, false, true, TermsAggregatorFactory.ExecutionMode.GLOBAL_ORDINALS, 4); + + // Fields indexed, deleted documents in segment: cannot use LeafBucketCollector#termDocFreqCollector - all documents are visited + testSimple(ADD_SORTED_SET_FIELD_INDEXED, true, true, TermsAggregatorFactory.ExecutionMode.GLOBAL_ORDINALS, 4); + + // Fields indexed, no deleted documents in segment: will use LeafBucketCollector#termDocFreqCollector - no documents are visited + testSimple(ADD_SORTED_SET_FIELD_INDEXED, false, true, TermsAggregatorFactory.ExecutionMode.GLOBAL_ORDINALS, 0); } /** - * This test case utilizes the low cardinality implementation of GlobalOrdinalsStringTermsAggregator. - * In this case, the segment terms will get initialized and will use LeafBucketCollector#termDocFreqCollector optimization + * This test case utilizes the MapStringTermsAggregator. 
*/ - public void testSimpleAggregationWithStoredValues() throws Exception { - // aggregator.postCollection() is not required when LeafBucketCollector#termDocFreqCollector optimization is used, - // therefore using NOOP_POST_COLLECTION - // This also verifies that the bucket count is completed without running postCollection() - testSimple(ADD_SORTED_FIELD_STORE, NOOP_POST_COLLECTION); + public void testSimpleMapStringAggregation() throws Exception { + testSimple(ADD_SORTED_SET_FIELD_INDEXED, randomBoolean(), randomBoolean(), TermsAggregatorFactory.ExecutionMode.MAP, 4); } /** * This is a utility method to test out string terms aggregation * @param addFieldConsumer a function that determines how a field is added to the document + * @param includeDeletedDocumentsInSegment to include deleted documents in the segment or not + * @param collectSegmentOrds collect segment ords or not - set true to utilize LowCardinality implementation for GlobalOrdinalsStringTermsAggregator + * @param executionMode execution mode MAP or GLOBAL_ORDINALS + * @param expectedCollectCount expected number of documents visited as part of collect() invocation */ - private void testSimple(TriConsumer addFieldConsumer, Consumer postCollectionConsumer) - throws Exception { + private void testSimple( + TriConsumer addFieldConsumer, + final boolean includeDeletedDocumentsInSegment, + boolean collectSegmentOrds, + TermsAggregatorFactory.ExecutionMode executionMode, + final int expectedCollectCount + ) throws Exception { try (Directory directory = newDirectory()) { try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory)) { Document document = new Document(); @@ -301,94 +344,84 @@ private void testSimple(TriConsumer addFieldConsumer, document = new Document(); addFieldConsumer.apply(document, "string", ""); indexWriter.addDocument(document); + + if (includeDeletedDocumentsInSegment) { + document = new Document(); + ADD_SORTED_SET_FIELD_INDEXED.apply(document, "string", "e"); + indexWriter.addDocument(document); + indexWriter.deleteDocuments(new Term("string", "e")); + assertEquals(5, indexWriter.getDocStats().maxDoc); + } + assertEquals(4, indexWriter.getDocStats().numDocs); + try (IndexReader indexReader = maybeWrapReaderEs(indexWriter.getReader())) { IndexSearcher indexSearcher = newIndexSearcher(indexReader); - for (TermsAggregatorFactory.ExecutionMode executionMode : TermsAggregatorFactory.ExecutionMode.values()) { - TermsAggregationBuilder aggregationBuilder = new TermsAggregationBuilder("_name").userValueTypeHint( - ValueType.STRING - ).executionHint(executionMode.toString()).field("string").order(BucketOrder.key(true)); - MappedFieldType fieldType = new KeywordFieldMapper.KeywordFieldType("string"); - TermsAggregator aggregator = createAggregator(aggregationBuilder, indexSearcher, fieldType); - aggregator.preCollection(); - indexSearcher.search(new MatchAllDocsQuery(), aggregator); - postCollectionConsumer.accept(aggregator); - Terms result = reduce(aggregator); - assertEquals(5, result.getBuckets().size()); - assertEquals("", result.getBuckets().get(0).getKeyAsString()); - assertEquals(2L, result.getBuckets().get(0).getDocCount()); - assertEquals("a", result.getBuckets().get(1).getKeyAsString()); - assertEquals(2L, result.getBuckets().get(1).getDocCount()); - assertEquals("b", result.getBuckets().get(2).getKeyAsString()); - assertEquals(2L, result.getBuckets().get(2).getDocCount()); - assertEquals("c", result.getBuckets().get(3).getKeyAsString()); - assertEquals(1L, 
result.getBuckets().get(3).getDocCount()); - assertEquals("d", result.getBuckets().get(4).getKeyAsString()); - assertEquals(1L, result.getBuckets().get(4).getDocCount()); - assertTrue(AggregationInspectionHelper.hasValue((InternalTerms) result)); - } + TermsAggregationBuilder aggregationBuilder = new TermsAggregationBuilder("_name").userValueTypeHint(ValueType.STRING) + .executionHint(executionMode.toString()) + .field("string") + .order(BucketOrder.key(true)); + MappedFieldType fieldType = new KeywordFieldMapper.KeywordFieldType("string"); + + TermsAggregatorFactory.COLLECT_SEGMENT_ORDS = collectSegmentOrds; + TermsAggregatorFactory.REMAP_GLOBAL_ORDS = false; + CountingAggregator aggregator = createCountingAggregator(aggregationBuilder, indexSearcher, false, fieldType); + + aggregator.preCollection(); + indexSearcher.search(new MatchAllDocsQuery(), aggregator); + aggregator.postCollection(); + Terms result = reduce(aggregator); + assertEquals(5, result.getBuckets().size()); + assertEquals("", result.getBuckets().get(0).getKeyAsString()); + assertEquals(2L, result.getBuckets().get(0).getDocCount()); + assertEquals("a", result.getBuckets().get(1).getKeyAsString()); + assertEquals(2L, result.getBuckets().get(1).getDocCount()); + assertEquals("b", result.getBuckets().get(2).getKeyAsString()); + assertEquals(2L, result.getBuckets().get(2).getDocCount()); + assertEquals("c", result.getBuckets().get(3).getKeyAsString()); + assertEquals(1L, result.getBuckets().get(3).getDocCount()); + assertEquals("d", result.getBuckets().get(4).getKeyAsString()); + assertEquals(1L, result.getBuckets().get(4).getDocCount()); + assertTrue(AggregationInspectionHelper.hasValue((InternalTerms) result)); + + assertEquals(expectedCollectCount, aggregator.getCollectCount().get()); } } } } - /** - * This test case utilizes the default implementation of GlobalOrdinalsStringTermsAggregator. - * In this case, the segment terms will not get initialized and will run without LeafBucketCollector#termDocFreqCollector optimization - */ public void testStringIncludeExclude() throws Exception { - testStringIncludeExclude( - (document, field, value) -> document.add(new SortedSetDocValuesField(field, new BytesRef(value))), - DEFAULT_POST_COLLECTION - ); - } - - /** - * This test case utilizes the default implementation of GlobalOrdinalsStringTermsAggregator. 
- * In this case, the segment terms will get initialized and will use LeafBucketCollector#termDocFreqCollector optimization - */ - public void testStringIncludeExcludeWithStoredValues() throws Exception { - // aggregator.postCollection() is not required when LeafBucketCollector#termDocFreqCollector optimization is used - // This also verifies that the bucket count is completed without running postCollection() - testStringIncludeExclude((document, field, value) -> { - document.add(new SortedSetDocValuesField(field, new BytesRef(value))); - document.add(new StringField(field, value, Field.Store.NO)); - }, NOOP_POST_COLLECTION); - } - - private void testStringIncludeExclude(TriConsumer addField, Consumer postCollectionConsumer) - throws Exception { try (Directory directory = newDirectory()) { try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory)) { Document document = new Document(); - addField.apply(document, "mv_field", "val000"); - addField.apply(document, "mv_field", "val001"); - addField.apply(document, "sv_field", "val001"); + document.add(new SortedSetDocValuesField("mv_field", new BytesRef("val000"))); + document.add(new SortedSetDocValuesField("mv_field", new BytesRef("val001"))); + document.add(new SortedDocValuesField("sv_field", new BytesRef("val001"))); indexWriter.addDocument(document); document = new Document(); - addField.apply(document, "mv_field", "val002"); - addField.apply(document, "mv_field", "val003"); - addField.apply(document, "sv_field", "val003"); + document.add(new SortedSetDocValuesField("mv_field", new BytesRef("val002"))); + document.add(new SortedSetDocValuesField("mv_field", new BytesRef("val003"))); + document.add(new SortedDocValuesField("sv_field", new BytesRef("val003"))); indexWriter.addDocument(document); document = new Document(); - addField.apply(document, "mv_field", "val004"); - addField.apply(document, "mv_field", "val005"); - addField.apply(document, "sv_field", "val005"); + document.add(new SortedSetDocValuesField("mv_field", new BytesRef("val004"))); + document.add(new SortedSetDocValuesField("mv_field", new BytesRef("val005"))); + document.add(new SortedDocValuesField("sv_field", new BytesRef("val005"))); indexWriter.addDocument(document); document = new Document(); - addField.apply(document, "mv_field", "val006"); - addField.apply(document, "mv_field", "val007"); - addField.apply(document, "sv_field", "val007"); + document.add(new SortedSetDocValuesField("mv_field", new BytesRef("val006"))); + document.add(new SortedSetDocValuesField("mv_field", new BytesRef("val007"))); + document.add(new SortedDocValuesField("sv_field", new BytesRef("val007"))); indexWriter.addDocument(document); document = new Document(); - addField.apply(document, "mv_field", "val008"); - addField.apply(document, "mv_field", "val009"); - addField.apply(document, "sv_field", "val009"); + document.add(new SortedSetDocValuesField("mv_field", new BytesRef("val008"))); + document.add(new SortedSetDocValuesField("mv_field", new BytesRef("val009"))); + document.add(new SortedDocValuesField("sv_field", new BytesRef("val009"))); indexWriter.addDocument(document); document = new Document(); - addField.apply(document, "mv_field", "val010"); - addField.apply(document, "mv_field", "val011"); - addField.apply(document, "sv_field", "val011"); + document.add(new SortedSetDocValuesField("mv_field", new BytesRef("val010"))); + document.add(new SortedSetDocValuesField("mv_field", new BytesRef("val011"))); + document.add(new SortedDocValuesField("sv_field", new 
BytesRef("val011"))); indexWriter.addDocument(document); try (IndexReader indexReader = maybeWrapReaderEs(indexWriter.getReader())) { IndexSearcher indexSearcher = newIndexSearcher(indexReader); @@ -405,7 +438,7 @@ private void testStringIncludeExclude(TriConsumer addF TermsAggregator aggregator = createAggregator(aggregationBuilder, indexSearcher, fieldType); aggregator.preCollection(); indexSearcher.search(new MatchAllDocsQuery(), aggregator); - postCollectionConsumer.accept(aggregator); + aggregator.postCollection(); Terms result = reduce(aggregator); assertEquals(10, result.getBuckets().size()); assertEquals("val000", result.getBuckets().get(0).getKeyAsString()); @@ -440,7 +473,7 @@ private void testStringIncludeExclude(TriConsumer addF aggregator = createAggregator(aggregationBuilder, indexSearcher, fieldType2); aggregator.preCollection(); indexSearcher.search(new MatchAllDocsQuery(), aggregator); - postCollectionConsumer.accept(aggregator); + aggregator.postCollection(); result = reduce(aggregator); assertEquals(5, result.getBuckets().size()); assertEquals("val001", result.getBuckets().get(0).getKeyAsString()); @@ -464,7 +497,7 @@ private void testStringIncludeExclude(TriConsumer addF aggregator = createAggregator(aggregationBuilder, indexSearcher, fieldType); aggregator.preCollection(); indexSearcher.search(new MatchAllDocsQuery(), aggregator); - postCollectionConsumer.accept(aggregator); + aggregator.postCollection(); result = reduce(aggregator); assertEquals(8, result.getBuckets().size()); assertEquals("val002", result.getBuckets().get(0).getKeyAsString()); @@ -493,7 +526,7 @@ private void testStringIncludeExclude(TriConsumer addF aggregator = createAggregator(aggregationBuilder, indexSearcher, fieldType); aggregator.preCollection(); indexSearcher.search(new MatchAllDocsQuery(), aggregator); - postCollectionConsumer.accept(aggregator); + aggregator.postCollection(); result = reduce(aggregator); assertEquals(2, result.getBuckets().size()); assertEquals("val010", result.getBuckets().get(0).getKeyAsString()); @@ -510,7 +543,7 @@ private void testStringIncludeExclude(TriConsumer addF aggregator = createAggregator(aggregationBuilder, indexSearcher, fieldType); aggregator.preCollection(); indexSearcher.search(new MatchAllDocsQuery(), aggregator); - postCollectionConsumer.accept(aggregator); + aggregator.postCollection(); result = reduce(aggregator); assertEquals(2, result.getBuckets().size()); assertEquals("val000", result.getBuckets().get(0).getKeyAsString()); @@ -542,7 +575,7 @@ private void testStringIncludeExclude(TriConsumer addF aggregator = createAggregator(aggregationBuilder, indexSearcher, fieldType); aggregator.preCollection(); indexSearcher.search(new MatchAllDocsQuery(), aggregator); - postCollectionConsumer.accept(aggregator); + aggregator.postCollection(); result = reduce(aggregator); assertEquals(2, result.getBuckets().size()); assertEquals("val000", result.getBuckets().get(0).getKeyAsString()); diff --git a/test/framework/src/main/java/org/opensearch/search/aggregations/AggregatorTestCase.java b/test/framework/src/main/java/org/opensearch/search/aggregations/AggregatorTestCase.java index 9fdae80bd1ada..4eb49ebb42241 100644 --- a/test/framework/src/main/java/org/opensearch/search/aggregations/AggregatorTestCase.java +++ b/test/framework/src/main/java/org/opensearch/search/aggregations/AggregatorTestCase.java @@ -124,6 +124,7 @@ import org.opensearch.search.aggregations.AggregatorFactories.Builder; import 
org.opensearch.search.aggregations.MultiBucketConsumerService.MultiBucketConsumer; import org.opensearch.search.aggregations.bucket.nested.NestedAggregationBuilder; +import org.opensearch.search.aggregations.bucket.terms.TermsAggregator; import org.opensearch.search.aggregations.metrics.MetricsAggregator; import org.opensearch.search.aggregations.pipeline.PipelineAggregator; import org.opensearch.search.aggregations.pipeline.PipelineAggregator.PipelineTree; @@ -150,6 +151,7 @@ import java.util.List; import java.util.Map; import java.util.Objects; +import java.util.concurrent.atomic.AtomicInteger; import java.util.function.Consumer; import java.util.function.Function; import java.util.function.Supplier; @@ -181,23 +183,10 @@ public abstract class AggregatorTestCase extends OpenSearchTestCase { // A list of field types that should not be tested, or are not currently supported private static List TYPE_TEST_DENYLIST; - protected static final Consumer DEFAULT_POST_COLLECTION = termsAggregator -> { - try { - termsAggregator.postCollection(); - } catch (IOException e) { - throw new RuntimeException(e); - } - }; - - // aggregator.postCollection() is not required when LeafBucketCollector#termDocFreqCollector optimization is used. - // using NOOP_POST_COLLECTION_CONSUMER ensures that the bucket count in aggregation is completed before/without running postCollection() - protected static final Consumer NOOP_POST_COLLECTION = termsAggregator -> {}; + protected static final TriConsumer ADD_SORTED_SET_FIELD_NOT_INDEXED = (document, field, value) -> document + .add(new SortedSetDocValuesField(field, new BytesRef(value))); - protected static final TriConsumer ADD_SORTED_FIELD_NO_STORE = (document, field, value) -> document.add( - new SortedSetDocValuesField(field, new BytesRef(value)) - ); - - protected static final TriConsumer ADD_SORTED_FIELD_STORE = (document, field, value) -> { + protected static final TriConsumer ADD_SORTED_SET_FIELD_INDEXED = (document, field, value) -> { document.add(new SortedSetDocValuesField(field, new BytesRef(value))); document.add(new StringField(field, value, Field.Store.NO)); }; @@ -457,7 +446,6 @@ protected QueryShardContext queryShardContextMock( CircuitBreakerService circuitBreakerService, BigArrays bigArrays ) { - return new QueryShardContext( 0, indexSettings, @@ -508,16 +496,6 @@ protected A searchAndReduc return searchAndReduce(createIndexSettings(), searcher, query, builder, DEFAULT_MAX_BUCKETS, fieldTypes); } - protected A searchAndReduce( - IndexSearcher searcher, - Query query, - AggregationBuilder builder, - Consumer postCollectionConsumer, - MappedFieldType... fieldTypes - ) throws IOException { - return searchAndReduce(createIndexSettings(), searcher, query, builder, DEFAULT_MAX_BUCKETS, postCollectionConsumer, fieldTypes); - } - protected A searchAndReduce( IndexSettings indexSettings, IndexSearcher searcher, @@ -538,17 +516,6 @@ protected A searchAndReduc return searchAndReduce(createIndexSettings(), searcher, query, builder, maxBucket, fieldTypes); } - protected A searchAndReduce( - IndexSettings indexSettings, - IndexSearcher searcher, - Query query, - AggregationBuilder builder, - int maxBucket, - MappedFieldType... fieldTypes - ) throws IOException { - return searchAndReduce(indexSettings, searcher, query, builder, maxBucket, DEFAULT_POST_COLLECTION, fieldTypes); - } - /** * Collects all documents that match the provided query {@link Query} and * returns the reduced {@link InternalAggregation}. 
@@ -563,7 +530,6 @@ protected A searchAndReduc Query query, AggregationBuilder builder, int maxBucket, - Consumer postCollectionConsumer, MappedFieldType... fieldTypes ) throws IOException { final IndexReaderContext ctx = searcher.getTopReaderContext(); @@ -594,13 +560,13 @@ protected A searchAndReduc a.preCollection(); Weight weight = subSearcher.createWeight(rewritten, ScoreMode.COMPLETE, 1f); subSearcher.search(weight, a); - postCollectionConsumer.accept(a); + a.postCollection(); aggs.add(a.buildTopLevel()); } } else { root.preCollection(); searcher.search(rewritten, root); - postCollectionConsumer.accept(root); + root.postCollection(); aggs.add(root.buildTopLevel()); } @@ -1142,6 +1108,89 @@ protected void doWriteTo(StreamOutput out) throws IOException { } } + /** + * Wrapper around Aggregator class + * Maintains a count for times collect() is invoked - number of documents visited + */ + protected static class CountingAggregator extends Aggregator { + private final AtomicInteger collectCounter; + public final Aggregator delegate; + + public CountingAggregator(AtomicInteger collectCounter, TermsAggregator delegate) { + this.collectCounter = collectCounter; + this.delegate = delegate; + } + + public AtomicInteger getCollectCount() { + return collectCounter; + } + + @Override + public void close() { + delegate.close(); + } + + @Override + public String name() { + return delegate.name(); + } + + @Override + public SearchContext context() { + return delegate.context(); + } + + @Override + public Aggregator parent() { + return delegate.parent(); + } + + @Override + public Aggregator subAggregator(String name) { + return delegate.subAggregator(name); + } + + @Override + public InternalAggregation[] buildAggregations(long[] owningBucketOrds) throws IOException { + return delegate.buildAggregations(owningBucketOrds); + } + + @Override + public InternalAggregation buildEmptyAggregation() { + return delegate.buildEmptyAggregation(); + } + + @Override + public LeafBucketCollector getLeafCollector(LeafReaderContext ctx) throws IOException { + return new LeafBucketCollector() { + @Override + public void collect(int doc, long bucket) throws IOException { + delegate.getLeafCollector(ctx).collect(doc, bucket); + collectCounter.incrementAndGet(); + } + }; + } + + @Override + public ScoreMode scoreMode() { + return delegate.scoreMode(); + } + + @Override + public void preCollection() throws IOException { + delegate.preCollection(); + } + + @Override + public void postCollection() throws IOException { + delegate.postCollection(); + } + + public void setWeight(Weight weight) { + this.delegate.setWeight(weight); + } + } + public static class InternalAggCardinality extends InternalAggregation { private final CardinalityUpperBound cardinality; From 128db478a4a8b5b15febc60214fbd4b2a79c1525 Mon Sep 17 00:00:00 2001 From: Sandesh Kumar Date: Fri, 8 Mar 2024 16:32:12 -0800 Subject: [PATCH 28/30] test typos Signed-off-by: Sandesh Kumar --- .../bucket/terms/TermsAggregatorTests.java | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/TermsAggregatorTests.java b/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/TermsAggregatorTests.java index b8a08068f76a3..8272c31e95e54 100644 --- a/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/TermsAggregatorTests.java +++ b/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/TermsAggregatorTests.java @@ -228,6 +228,7 @@ 
protected List getSupportedValuesSourceTypes() { } public void testUsesGlobalOrdinalsByDefault() throws Exception { + boolean randomizeAggregatorImpl = false; Directory directory = newDirectory(); RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory); indexWriter.close(); @@ -239,35 +240,35 @@ public void testUsesGlobalOrdinalsByDefault() throws Exception { .field("string"); MappedFieldType fieldType = new KeywordFieldMapper.KeywordFieldType("string"); - TermsAggregator aggregator = createAggregator(aggregationBuilder, indexSearcher, false, fieldType); + TermsAggregator aggregator = createAggregator(aggregationBuilder, indexSearcher, randomizeAggregatorImpl, fieldType); assertThat(aggregator, instanceOf(GlobalOrdinalsStringTermsAggregator.class)); GlobalOrdinalsStringTermsAggregator globalAgg = (GlobalOrdinalsStringTermsAggregator) aggregator; assertThat(globalAgg.descriptCollectionStrategy(), equalTo("dense")); // Infers depth_first because the maxOrd is 0 which is less than the size aggregationBuilder.subAggregation(AggregationBuilders.cardinality("card").field("string")); - aggregator = createAggregator(aggregationBuilder, indexSearcher, fieldType); + aggregator = createAggregator(aggregationBuilder, indexSearcher, randomizeAggregatorImpl, fieldType); assertThat(aggregator, instanceOf(GlobalOrdinalsStringTermsAggregator.class)); globalAgg = (GlobalOrdinalsStringTermsAggregator) aggregator; assertThat(globalAgg.collectMode, equalTo(Aggregator.SubAggCollectionMode.DEPTH_FIRST)); assertThat(globalAgg.descriptCollectionStrategy(), equalTo("remap")); aggregationBuilder.collectMode(Aggregator.SubAggCollectionMode.DEPTH_FIRST); - aggregator = createAggregator(aggregationBuilder, indexSearcher, fieldType); + aggregator = createAggregator(aggregationBuilder, indexSearcher, randomizeAggregatorImpl, fieldType); assertThat(aggregator, instanceOf(GlobalOrdinalsStringTermsAggregator.class)); globalAgg = (GlobalOrdinalsStringTermsAggregator) aggregator; assertThat(globalAgg.collectMode, equalTo(Aggregator.SubAggCollectionMode.DEPTH_FIRST)); assertThat(globalAgg.descriptCollectionStrategy(), equalTo("remap")); aggregationBuilder.collectMode(Aggregator.SubAggCollectionMode.BREADTH_FIRST); - aggregator = createAggregator(aggregationBuilder, indexSearcher, fieldType); + aggregator = createAggregator(aggregationBuilder, indexSearcher, randomizeAggregatorImpl, fieldType); assertThat(aggregator, instanceOf(GlobalOrdinalsStringTermsAggregator.class)); globalAgg = (GlobalOrdinalsStringTermsAggregator) aggregator; assertThat(globalAgg.collectMode, equalTo(Aggregator.SubAggCollectionMode.BREADTH_FIRST)); assertThat(globalAgg.descriptCollectionStrategy(), equalTo("dense")); aggregationBuilder.order(BucketOrder.aggregation("card", true)); - aggregator = createAggregator(aggregationBuilder, indexSearcher, fieldType); + aggregator = createAggregator(aggregationBuilder, indexSearcher, randomizeAggregatorImpl, fieldType); assertThat(aggregator, instanceOf(GlobalOrdinalsStringTermsAggregator.class)); globalAgg = (GlobalOrdinalsStringTermsAggregator) aggregator; assertThat(globalAgg.descriptCollectionStrategy(), equalTo("remap")); @@ -350,9 +351,8 @@ private void testSimple( ADD_SORTED_SET_FIELD_INDEXED.apply(document, "string", "e"); indexWriter.addDocument(document); indexWriter.deleteDocuments(new Term("string", "e")); - assertEquals(5, indexWriter.getDocStats().maxDoc); + assertEquals(5, indexWriter.getDocStats().maxDoc); // deleted document still in segment } - assertEquals(4, 
indexWriter.getDocStats().numDocs); try (IndexReader indexReader = maybeWrapReaderEs(indexWriter.getReader())) { IndexSearcher indexSearcher = newIndexSearcher(indexReader); From 7954045710ccd1ef8a831b271ad0dd66d3643dce Mon Sep 17 00:00:00 2001 From: Sandesh Kumar Date: Mon, 11 Mar 2024 00:35:06 -0700 Subject: [PATCH 29/30] Add test case for _doc_count Signed-off-by: Sandesh Kumar --- .../bucket/terms/TermsAggregatorTests.java | 45 ++++++++++++++----- 1 file changed, 35 insertions(+), 10 deletions(-) diff --git a/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/TermsAggregatorTests.java b/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/TermsAggregatorTests.java index 8272c31e95e54..cfb04d2aa1d19 100644 --- a/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/TermsAggregatorTests.java +++ b/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/TermsAggregatorTests.java @@ -278,38 +278,54 @@ public void testUsesGlobalOrdinalsByDefault() throws Exception { } /** - * This test case utilizes the default implementation of GlobalOrdinalsStringTermsAggregator. + * This test case utilizes the default implementation of GlobalOrdinalsStringTermsAggregator since collectSegmentOrds is false. */ public void testSimpleAggregation() throws Exception { // Fields not indexed: cannot use LeafBucketCollector#termDocFreqCollector - all documents are visited - testSimple(ADD_SORTED_SET_FIELD_NOT_INDEXED, false, false, TermsAggregatorFactory.ExecutionMode.GLOBAL_ORDINALS, 4); + testSimple(ADD_SORTED_SET_FIELD_NOT_INDEXED, false, false, false, TermsAggregatorFactory.ExecutionMode.GLOBAL_ORDINALS, 4); // Fields indexed, deleted documents in segment: cannot use LeafBucketCollector#termDocFreqCollector - all documents are visited - testSimple(ADD_SORTED_SET_FIELD_INDEXED, true, false, TermsAggregatorFactory.ExecutionMode.GLOBAL_ORDINALS, 4); + testSimple(ADD_SORTED_SET_FIELD_INDEXED, true, false, false, TermsAggregatorFactory.ExecutionMode.GLOBAL_ORDINALS, 4); // Fields indexed, no deleted documents in segment: will use LeafBucketCollector#termDocFreqCollector - no documents are visited - testSimple(ADD_SORTED_SET_FIELD_INDEXED, false, false, TermsAggregatorFactory.ExecutionMode.GLOBAL_ORDINALS, 0); + testSimple(ADD_SORTED_SET_FIELD_INDEXED, false, false, false, TermsAggregatorFactory.ExecutionMode.GLOBAL_ORDINALS, 0); + + // Fields indexed, no deleted documents, but a _doc_count field value present in the document: + // cannot use LeafBucketCollector#termDocFreqCollector - all documents are visited + testSimple(ADD_SORTED_SET_FIELD_INDEXED, false, true, false, TermsAggregatorFactory.ExecutionMode.GLOBAL_ORDINALS, 4); + } /** - * This test case utilizes the LowCardinality implementation of GlobalOrdinalsStringTermsAggregator.
+ * This test case utilizes the LowCardinality implementation of GlobalOrdinalsStringTermsAggregator since collectSegmentOrds is true. */ public void testSimpleAggregationLowCardinality() throws Exception { // Fields not indexed: cannot use LeafBucketCollector#termDocFreqCollector - all documents are visited - testSimple(ADD_SORTED_SET_FIELD_NOT_INDEXED, false, true, TermsAggregatorFactory.ExecutionMode.GLOBAL_ORDINALS, 4); + testSimple(ADD_SORTED_SET_FIELD_NOT_INDEXED, false, false, true, TermsAggregatorFactory.ExecutionMode.GLOBAL_ORDINALS, 4); // Fields indexed, deleted documents in segment: cannot use LeafBucketCollector#termDocFreqCollector - all documents are visited - testSimple(ADD_SORTED_SET_FIELD_INDEXED, true, true, TermsAggregatorFactory.ExecutionMode.GLOBAL_ORDINALS, 4); + testSimple(ADD_SORTED_SET_FIELD_INDEXED, true, false, true, TermsAggregatorFactory.ExecutionMode.GLOBAL_ORDINALS, 4); // Fields indexed, no deleted documents in segment: will use LeafBucketCollector#termDocFreqCollector - no documents are visited - testSimple(ADD_SORTED_SET_FIELD_INDEXED, false, true, TermsAggregatorFactory.ExecutionMode.GLOBAL_ORDINALS, 0); + testSimple(ADD_SORTED_SET_FIELD_INDEXED, false, false, true, TermsAggregatorFactory.ExecutionMode.GLOBAL_ORDINALS, 0); + + // Fields indexed, no deleted documents, but a _doc_count field value present in the document: + // cannot use LeafBucketCollector#termDocFreqCollector - all documents are visited + testSimple(ADD_SORTED_SET_FIELD_INDEXED, false, true, true, TermsAggregatorFactory.ExecutionMode.GLOBAL_ORDINALS, 4); } /** * This test case utilizes the MapStringTermsAggregator. */ public void testSimpleMapStringAggregation() throws Exception { - testSimple(ADD_SORTED_SET_FIELD_INDEXED, randomBoolean(), randomBoolean(), TermsAggregatorFactory.ExecutionMode.MAP, 4); + testSimple( + ADD_SORTED_SET_FIELD_INDEXED, + randomBoolean(), + randomBoolean(), + randomBoolean(), + TermsAggregatorFactory.ExecutionMode.MAP, + 4 + ); } /** @@ -323,6 +339,7 @@ public void testSimpleMapStringAggregation() throws Exception { private void testSimple( TriConsumer addFieldConsumer, final boolean includeDeletedDocumentsInSegment, + final boolean includeDocCountField, boolean collectSegmentOrds, TermsAggregatorFactory.ExecutionMode executionMode, final int expectedCollectCount @@ -344,6 +361,10 @@ private void testSimple( indexWriter.addDocument(document); document = new Document(); addFieldConsumer.apply(document, "string", ""); + if (includeDocCountField) { + // Adding _doc_count to one document + document.add(new NumericDocValuesField("_doc_count", 10)); + } indexWriter.addDocument(document); if (includeDeletedDocumentsInSegment) { @@ -373,7 +394,11 @@ private void testSimple( Terms result = reduce(aggregator); assertEquals(5, result.getBuckets().size()); assertEquals("", result.getBuckets().get(0).getKeyAsString()); - assertEquals(2L, result.getBuckets().get(0).getDocCount()); + if (includeDocCountField) { + assertEquals(11L, result.getBuckets().get(0).getDocCount()); + } else { + assertEquals(2L, result.getBuckets().get(0).getDocCount()); + } assertEquals("a", result.getBuckets().get(1).getKeyAsString()); assertEquals(2L, result.getBuckets().get(1).getDocCount()); assertEquals("b", result.getBuckets().get(2).getKeyAsString()); From cdc42044c7be5c727b8eb012ff92e51749e6bd4d Mon Sep 17 00:00:00 2001 From: Sandesh Kumar Date: Mon, 11 Mar 2024 17:08:10 -0700 Subject: [PATCH 30/30] Rewording for changelog and minor documentation changes Signed-off-by: Sandesh Kumar --- CHANGELOG.md | 2 +-
.../bucket/terms/GlobalOrdinalsStringTermsAggregator.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2be1d8eda34f6..7f0118711126e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -128,7 +128,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ### Changed - Allow composite aggregation to run under a parent filter aggregation ([#11499](https://github.com/opensearch-project/OpenSearch/pull/11499)) -- Improve string terms aggregation performance using Collector#setWeight ([#11643](https://github.com/opensearch-project/OpenSearch/pull/11643)) +- Quickly compute terms aggregations when the top-level query is functionally match-all for a segment ([#11643](https://github.com/opensearch-project/OpenSearch/pull/11643)) ### Deprecated diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java index 15e538f01e632..69fda2f3f6133 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java @@ -159,7 +159,7 @@ public void setWeight(Weight weight) { } /** - Collects term frequencies for a given field from a LeafReaderContext directly from stored segment terms + Reads doc frequencies directly from the indexed terms in the segment to skip iterating through individual documents. @param ctx The LeafReaderContext to collect terms from @param globalOrds The SortedSetDocValues for the field's ordinals @param ordCountConsumer A consumer to accept collected term frequencies
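For reference, the reworded javadoc above captures the core of the termDocFreqCollector optimization: when every document in a segment matches the top-level query, the segment has no deleted documents, and no _doc_count values are present, per-term document frequencies can be read straight from the segment's term dictionary instead of visiting each document. The sketch below is a minimal, self-contained illustration of that idea, not the patch's implementation: the class name TermDocFreqSketch and the BiConsumer-based ordCountConsumer signature are assumptions made for the example, whereas the real method lives in GlobalOrdinalsStringTermsAggregator and uses the aggregator's own consumer type.

import java.io.IOException;
import java.util.function.BiConsumer;

import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

/**
 * Illustrative sketch (hypothetical helper, not part of this patch): reads each term's
 * document frequency from the segment's term dictionary and reports it against the
 * term's global ordinal, without iterating over documents. Only valid when every
 * document in the segment matches the query, nothing is deleted, and no _doc_count
 * values are present.
 */
final class TermDocFreqSketch {

    private TermDocFreqSketch() {}

    static void collectTermDocFreq(
        LeafReaderContext ctx,
        String fieldName,
        SortedSetDocValues globalOrds,
        BiConsumer<Long, Integer> ordCountConsumer // (global ordinal, doc frequency) - assumed signature
    ) throws IOException {
        Terms segmentTerms = ctx.reader().terms(fieldName);
        if (segmentTerms == null) {
            return; // the field has no indexed terms in this segment
        }
        TermsEnum termsEnum = segmentTerms.iterator();
        for (BytesRef term = termsEnum.next(); term != null; term = termsEnum.next()) {
            int docFreq = termsEnum.docFreq(); // read from the term dictionary, no documents visited
            long globalOrd = globalOrds.lookupTerm(term);
            if (globalOrd >= 0) {
                ordCountConsumer.accept(globalOrd, docFreq);
            }
        }
    }
}

Under these preconditions the per-segment work is proportional to the number of unique terms rather than the number of documents, which is what the expectedCollectCount of 0 in the test cases above verifies.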