Modify KeywordField behavior
1. Use Lucene's KeywordField instead of custom Field type.
2. Use IndexOrDocValuesQuery for all MultiTermQueries.

Signed-off-by: Michael Froh <froh@amazon.com>
msfroh committed Aug 8, 2023
1 parent 2c33669 commit abaa86f
Showing 5 changed files with 85 additions and 28 deletions.
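For context on the second change: Lucene's IndexOrDocValuesQuery wraps two logically equivalent queries, one backed by the inverted index and one backed by doc values, and lets Lucene choose the cheaper variant per segment at search time. A minimal sketch of the pattern this commit applies, using a hypothetical "status" keyword field (the field name and helper are illustrative only, not code from this commit):

import org.apache.lucene.search.IndexOrDocValuesQuery;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.util.BytesRef;

class IndexOrDocValuesExample {
    // Builds the same range query two ways and lets Lucene pick per segment.
    static Query statusRange(BytesRef lower, BytesRef upper) {
        // Evaluated against the inverted index (the pre-existing behavior).
        Query indexQuery = new TermRangeQuery("status", lower, upper, true, false);
        // The same range, rewritten to execute against doc values instead.
        Query dvQuery = new TermRangeQuery("status", lower, upper, true, false, MultiTermQuery.DOC_VALUES_REWRITE);
        return new IndexOrDocValuesQuery(indexQuery, dvQuery);
    }
}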
@@ -12,6 +12,7 @@
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.KeywordField;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.Term;
@@ -29,10 +30,10 @@
import org.opensearch.common.collect.Iterators;
import org.opensearch.common.lucene.Lucene;
import org.opensearch.common.lucene.search.AutomatonQueries;
import org.opensearch.common.xcontent.JsonToStringXContentParser;
import org.opensearch.core.xcontent.DeprecationHandler;
import org.opensearch.core.xcontent.NamedXContentRegistry;
import org.opensearch.core.xcontent.XContentParser;
import org.opensearch.common.xcontent.JsonToStringXContentParser;
import org.opensearch.index.analysis.NamedAnalyzer;
import org.opensearch.index.fielddata.IndexFieldData;
import org.opensearch.index.fielddata.plain.SortedSetOrdinalsIndexFieldData;
@@ -116,7 +117,7 @@ private FlatObjectFieldType buildFlatObjectFieldType(BuilderContext context, Fie

/**
* ValueFieldMapper is the subfield type for values in the Json.
* use a {@link KeywordFieldMapper.KeywordField}
* use a {@link KeywordField}
*/
private ValueFieldMapper buildValueFieldMapper(BuilderContext context, FieldType fieldType, FlatObjectFieldType fft) {
String fullName = buildFullName(context);
@@ -129,7 +130,7 @@ private ValueFieldMapper buildValueFieldMapper(BuilderContext context, FieldType

/**
* ValueAndPathFieldMapper is the subfield type for path=value format in the Json.
* also use a {@link KeywordFieldMapper.KeywordField}
* also use a {@link KeywordField}
*/
private ValueAndPathFieldMapper buildValueAndPathFieldMapper(BuilderContext context, FieldType fieldType, FlatObjectFieldType fft) {
String fullName = buildFullName(context);
@@ -686,7 +687,7 @@ protected ValueAndPathFieldMapper(FieldType fieldType, KeywordFieldMapper.Keywor
void addField(ParseContext context, String value) {
final BytesRef binaryValue = new BytesRef(value);
if (fieldType.indexOptions() != IndexOptions.NONE || fieldType.stored()) {
Field field = new KeywordFieldMapper.KeywordField(fieldType().name(), binaryValue, fieldType);
Field field = new KeywordField(fieldType().name(), binaryValue, fieldType.stored() ? Field.Store.YES : Field.Store.NO);

context.doc().add(field);

@@ -727,7 +728,7 @@ protected ValueFieldMapper(FieldType fieldType, KeywordFieldMapper.KeywordFieldT
void addField(ParseContext context, String value) {
final BytesRef binaryValue = new BytesRef(value);
if (fieldType.indexOptions() != IndexOptions.NONE || fieldType.stored()) {
Field field = new KeywordFieldMapper.KeywordField(fieldType().name(), binaryValue, fieldType);
Field field = new KeywordField(fieldType().name(), binaryValue, fieldType.stored() ? Field.Store.YES : Field.Store.NO);
context.doc().add(field);

if (fieldType().hasDocValues() == false && fieldType.omitNorms()) {
@@ -36,10 +36,13 @@
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.KeywordField;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.search.IndexOrDocValuesQuery;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.util.BytesRef;
import org.opensearch.common.Nullable;
import org.opensearch.common.lucene.Lucene;
@@ -87,19 +90,6 @@ public static class Defaults {
}
}

/**
* The keyword field for the field mapper
*
* @opensearch.internal
*/
public static class KeywordField extends Field {

public KeywordField(String field, BytesRef term, FieldType ft) {
super(field, term, ft);
}

}

private static KeywordFieldMapper toType(FieldMapper in) {
return (KeywordFieldMapper) in;
}
@@ -258,7 +248,7 @@ public KeywordFieldType(String name, FieldType fieldType, NamedAnalyzer normaliz
name,
fieldType.indexOptions() != IndexOptions.NONE,
fieldType.stored(),
builder.hasDocValues.getValue(),
true, // Always has doc values
new TextSearchInfo(fieldType, builder.similarity.getValue(), searchAnalyzer, searchAnalyzer),
builder.meta.getValue()
);
@@ -383,6 +373,71 @@ public Query wildcardQuery(
// query text
return super.wildcardQuery(value, method, caseInsensitve, true, context);
}

@Override
public Query termsQuery(List<?> values, QueryShardContext context) {
failIfNotIndexed();
BytesRef[] bytesRefs = new BytesRef[values.size()];
for (int i = 0; i < bytesRefs.length; i++) {
bytesRefs[i] = indexedValueForSearch(values.get(i));
}
return KeywordField.newSetQuery(name(), bytesRefs);
}

@Override
public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, boolean caseInsensitive, QueryShardContext context) {
Query indexQuery = super.prefixQuery(value, method, caseInsensitive, context);
Query dvQuery = super.prefixQuery(value, MultiTermQuery.DOC_VALUES_REWRITE, caseInsensitive, context);
return new IndexOrDocValuesQuery(indexQuery, dvQuery);
}

@Override
public Query normalizedWildcardQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) {
Query indexQuery = super.normalizedWildcardQuery(value, method, context);
Query dvQuery = super.normalizedWildcardQuery(value, MultiTermQuery.DOC_VALUES_REWRITE, context);
return new IndexOrDocValuesQuery(indexQuery, dvQuery);
}

@Override
public Query regexpQuery(
String value,
int syntaxFlags,
int matchFlags,
int maxDeterminizedStates,
MultiTermQuery.RewriteMethod method,
QueryShardContext context
) {
Query indexQuery = super.regexpQuery(value, syntaxFlags, matchFlags, maxDeterminizedStates, method, context);
Query dvQuery = super.regexpQuery(
value,
syntaxFlags,
matchFlags,
maxDeterminizedStates,
MultiTermQuery.DOC_VALUES_REWRITE,
context
);
return new IndexOrDocValuesQuery(indexQuery, dvQuery);
}

@Override
public Query rangeQuery(Object lowerTerm, Object upperTerm, boolean includeLower, boolean includeUpper, QueryShardContext context) {
Query indexQuery = new TermRangeQuery(
name(),
lowerTerm == null ? null : indexedValueForSearch(lowerTerm),
upperTerm == null ? null : indexedValueForSearch(upperTerm),
includeLower,
includeUpper
);
Query dvQuery = new TermRangeQuery(
name(),
lowerTerm == null ? null : indexedValueForSearch(lowerTerm),
upperTerm == null ? null : indexedValueForSearch(upperTerm),
includeLower,
includeUpper,
MultiTermQuery.DOC_VALUES_REWRITE
);
return new IndexOrDocValuesQuery(indexQuery, dvQuery);
}
}

private final boolean indexed;
@@ -464,7 +519,7 @@ protected void parseCreateField(ParseContext context) throws IOException {
// convert to utf8 only once before feeding postings/dv/stored fields
final BytesRef binaryValue = new BytesRef(value);
if (fieldType.indexOptions() != IndexOptions.NONE || fieldType.stored()) {
Field field = new KeywordField(fieldType().name(), binaryValue, fieldType);
Field field = new KeywordField(fieldType().name(), binaryValue, fieldType.stored() ? Field.Store.YES : Field.Store.NO);
context.doc().add(field);

if (fieldType().hasDocValues() == false && fieldType.omitNorms()) {
@@ -32,6 +32,7 @@

package org.opensearch.search.suggest.completion.context;

import org.apache.lucene.document.KeywordField;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.document.StoredField;
@@ -41,7 +42,6 @@
import org.opensearch.core.xcontent.XContentBuilder;
import org.opensearch.core.xcontent.XContentParser;
import org.opensearch.core.xcontent.XContentParser.Token;
import org.opensearch.index.mapper.KeywordFieldMapper;
import org.opensearch.index.mapper.ParseContext;
import org.opensearch.index.mapper.ParseContext.Document;

@@ -156,7 +156,7 @@ public Set<String> parseContext(Document document) {
for (IndexableField field : fields) {
if (field instanceof SortedDocValuesField || field instanceof SortedSetDocValuesField || field instanceof StoredField) {
// Ignore doc values and stored fields
} else if (field instanceof KeywordFieldMapper.KeywordField) {
} else if (field instanceof KeywordField) {
values.add(field.binaryValue().utf8ToString());
} else if (field.stringValue() != null) {
values.add(field.stringValue());
@@ -8,6 +8,7 @@

package org.opensearch.index.mapper;

import org.apache.lucene.document.KeywordField;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexableField;
@@ -16,10 +17,10 @@
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.BytesRef;
import org.opensearch.common.Strings;
import org.opensearch.core.xcontent.ToXContent;
import org.opensearch.core.xcontent.XContentBuilder;
import org.opensearch.common.xcontent.XContentFactory;
import org.opensearch.common.xcontent.json.JsonXContent;
import org.opensearch.core.xcontent.ToXContent;
import org.opensearch.core.xcontent.XContentBuilder;
import org.opensearch.index.query.QueryShardContext;

import java.io.IOException;
@@ -119,12 +120,12 @@ public void testDefaults() throws Exception {
// Test internal substring fields as well
IndexableField[] fieldValues = doc.rootDoc().getFields("field" + VALUE_SUFFIX);
assertEquals(2, fieldValues.length);
assertTrue(fieldValues[0] instanceof KeywordFieldMapper.KeywordField);
assertTrue(fieldValues[0] instanceof KeywordField);
assertEquals(new BytesRef("bar"), fieldValues[0].binaryValue());

IndexableField[] fieldValueAndPaths = doc.rootDoc().getFields("field" + VALUE_AND_PATH_SUFFIX);
assertEquals(2, fieldValues.length);
assertTrue(fieldValueAndPaths[0] instanceof KeywordFieldMapper.KeywordField);
assertTrue(fieldValueAndPaths[0] instanceof KeywordField);
assertEquals(new BytesRef("field.foo=bar"), fieldValueAndPaths[0].binaryValue());
}

@@ -33,8 +33,8 @@
package org.opensearch.search.suggest.completion;

import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.document.KeywordField;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.document.StoredField;
@@ -809,7 +809,7 @@ public void testParsingContextFromDocument() throws Exception {
ParseContext.Document document = new ParseContext.Document();

KeywordFieldMapper.KeywordFieldType keyword = new KeywordFieldMapper.KeywordFieldType("category");
document.add(new KeywordFieldMapper.KeywordField(keyword.name(), new BytesRef("category1"), new FieldType()));
document.add(new KeywordField(keyword.name(), new BytesRef("category1"), Field.Store.NO));
// Ignore doc values
document.add(new SortedSetDocValuesField(keyword.name(), new BytesRef("category1")));
Set<String> context = mapping.parseContext(document);

2 comments on commit abaa86f

@harshavamsi

@msfroh I had a couple of questions regarding your implementation. While I'm going to ignore the changes made to FlatObjectField for now, I see that you're setting doc_values to always be true for the keyword field. I see in the docs that it is an optional field that defaults to false. Keyword does support both index and doc values, right?

I understand that you're overriding the various query methods in the mapper, but in the base class I also see other query methods that could be overridden as well -- https://github.com/opensearch-project/OpenSearch/blob/main/server/src/main/java/org/opensearch/index/mapper/MappedFieldType.java#L221-L372

Do you think we'd need to override all of them to use the index-vs-doc-values functionality?

@msfroh
Owner Author

msfroh commented on abaa86f Oct 3, 2023


Right, the current gaps are:

  1. We probably shouldn't use the new Lucene KeywordField type; we should just use the existing field type, which may or may not have doc values enabled (it does by default -- I think we need to open a documentation issue for that).
  2. We should check whether the field has doc values enabled and whether the field is indexed. Right now, those query implementations throw an exception if the field is not indexed. (A rough sketch of this dispatch follows the list.)
    1. If both are false, we should throw an exception.
    2. If the field is indexed, but doesn't have doc values, we should create the same query we do now.
    3. If the field has doc values, but is not indexed, we should return just the query with the doc values rewrite.
    4. If the field has both, we should return an IndexOrDocValuesQuery as I did in this commit. That's the "You know what, Lucene, you decide (per-segment) how to run this query" option.
  3. We should do the above for anything that ends up returning an implementation of Lucene's MultiTermQuery.
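A rough sketch of that dispatch as a standalone helper, given an index-backed query and its DOC_VALUES_REWRITE counterpart (the method and parameter names are hypothetical, not the actual mapper code):

import org.apache.lucene.search.IndexOrDocValuesQuery;
import org.apache.lucene.search.Query;

class QueryDispatchSketch {
    // Hypothetical helper combining the four cases above; not the actual OpenSearch implementation.
    static Query pick(String fieldName, boolean indexed, boolean hasDocValues, Query indexQuery, Query dvQuery) {
        if (indexed == false && hasDocValues == false) {
            // Case 1: nothing to search against.
            throw new IllegalArgumentException(
                "Field [" + fieldName + "] is not indexed and does not have doc values, so it cannot be searched");
        }
        if (indexed && hasDocValues == false) {
            // Case 2: index-only field -- keep the query built today.
            return indexQuery;
        }
        if (hasDocValues && indexed == false) {
            // Case 3: doc-values-only field -- use the DOC_VALUES_REWRITE variant.
            return dvQuery;
        }
        // Case 4: both available -- let Lucene choose per segment.
        return new IndexOrDocValuesQuery(indexQuery, dvQuery);
    }
}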
