From 704e42771c13577b7a5494be2026be38150e588f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Vl=C4=8Dek?= Date: Fri, 16 Feb 2024 11:49:37 +0100 Subject: [PATCH 1/2] Document Node Analysis Components API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This functionality was introduced in PR https://github.com/opensearch-project/OpenSearch/pull/10296 Signed-off-by: Lukáš Vlček --- _analyzers/index.md | 313 +++++++++++++++++++++++- _api-reference/nodes-apis/nodes-info.md | 2 + 2 files changed, 314 insertions(+), 1 deletion(-) diff --git a/_analyzers/index.md b/_analyzers/index.md index 95f97ec8ce..860a1559ae 100644 --- a/_analyzers/index.md +++ b/_analyzers/index.md @@ -64,6 +64,317 @@ Analyzer | Analysis performed | Analyzer output If needed, you can combine tokenizers, token filters, and character filters to create a custom analyzer. +With the introduction of OpenSearch `v2.12.1`, you can retrieve a comprehensive list of all available text analysis components by using [Nodes Info]({{site.url}}{{site.baseurl}}/api-reference/nodes-apis/nodes-info/). This can be helpful when building custom analyzers, especially in cases where you need to recall the component's name or identify the analysis plugin to which the component belongs. + +Introduced 2.12.1 +{: .label .label-purple } + +```json +GET /_nodes/analysis_components?pretty=true&filter_path=nodes.*.analysis_components +``` +{% include copy-curl.html %} + +The following is an example response from a node that includes the `analysis-common` module (a module that is present by default): + 
+ + Response + + {: .text-delta} + +```json +{ + "nodes" : { + "cZidmv5kQbWQN8M8dz9f5g" : { + "analysis_components" : { + "analyzers" : [ + "arabic", + "armenian", + "basque", + "bengali", + "brazilian", + "bulgarian", + "catalan", + "chinese", + "cjk", + "czech", + "danish", + "default", + "dutch", + "english", + "estonian", + "fingerprint", + "finnish", + "french", + "galician", + "german", + "greek", + "hindi", + "hungarian", + "indonesian", + "irish", + "italian", + "keyword", + "latvian", + "lithuanian", + "norwegian", + "pattern", + "persian", + "portuguese", + "romanian", + "russian", + "simple", + "snowball", + "sorani", + "spanish", + "standard", + "stop", + "swedish", + "thai", + "turkish", + "whitespace" + ], + "tokenizers" : [ + "PathHierarchy", + "char_group", + "classic", + "edgeNGram", + "edge_ngram", + "keyword", + "letter", + "lowercase", + "nGram", + "ngram", + "path_hierarchy", + "pattern", + "simple_pattern", + "simple_pattern_split", + "standard", + "thai", + "uax_url_email", + "whitespace" + ], + "tokenFilters" : [ + "apostrophe", + "arabic_normalization", + "arabic_stem", + "asciifolding", + "bengali_normalization", + "brazilian_stem", + "cjk_bigram", + "cjk_width", + "classic", + "common_grams", + "concatenate_graph", + "condition", + "czech_stem", + "decimal_digit", + "delimited_payload", + "delimited_term_freq", + "dictionary_decompounder", + "dutch_stem", + "edgeNGram", + "edge_ngram", + "elision", + "fingerprint", + "flatten_graph", + "french_stem", + "german_normalization", + "german_stem", + "hindi_normalization", + "hunspell", + "hyphenation_decompounder", + "indic_normalization", + "keep", + "keep_types", + "keyword_marker", + "kstem", + "length", + "limit", + "lowercase", + "min_hash", + "multiplexer", + "nGram", + "ngram", + "pattern_capture", + "pattern_replace", + "persian_normalization", + "porter_stem", + "predicate_token_filter", + "remove_duplicates", + "reverse", + "russian_stem", + "scandinavian_folding", + 
"scandinavian_normalization", + "serbian_normalization", + "shingle", + "snowball", + "sorani_normalization", + "standard", + "stemmer", + "stemmer_override", + "stop", + "synonym", + "synonym_graph", + "trim", + "truncate", + "unique", + "uppercase", + "word_delimiter", + "word_delimiter_graph" + ], + "charFilters" : [ + "html_strip", + "mapping", + "pattern_replace" + ], + "normalizers" : [ + "lowercase" + ], + "plugins" : [ + { + "name" : "analysis-common", + "classname" : "org.opensearch.analysis.common.CommonAnalysisModulePlugin", + "analyzers" : [ + "arabic", + "armenian", + "basque", + "bengali", + "brazilian", + "bulgarian", + "catalan", + "chinese", + "cjk", + "czech", + "danish", + "dutch", + "english", + "estonian", + "fingerprint", + "finnish", + "french", + "galician", + "german", + "greek", + "hindi", + "hungarian", + "indonesian", + "irish", + "italian", + "latvian", + "lithuanian", + "norwegian", + "pattern", + "persian", + "portuguese", + "romanian", + "russian", + "snowball", + "sorani", + "spanish", + "swedish", + "thai", + "turkish" + ], + "tokenizers" : [ + "PathHierarchy", + "char_group", + "classic", + "edgeNGram", + "edge_ngram", + "keyword", + "letter", + "lowercase", + "nGram", + "ngram", + "path_hierarchy", + "pattern", + "simple_pattern", + "simple_pattern_split", + "thai", + "uax_url_email", + "whitespace" + ], + "tokenFilters" : [ + "apostrophe", + "arabic_normalization", + "arabic_stem", + "asciifolding", + "bengali_normalization", + "brazilian_stem", + "cjk_bigram", + "cjk_width", + "classic", + "common_grams", + "concatenate_graph", + "condition", + "czech_stem", + "decimal_digit", + "delimited_payload", + "delimited_term_freq", + "dictionary_decompounder", + "dutch_stem", + "edgeNGram", + "edge_ngram", + "elision", + "fingerprint", + "flatten_graph", + "french_stem", + "german_normalization", + "german_stem", + "hindi_normalization", + "hyphenation_decompounder", + "indic_normalization", + "keep", + "keep_types", + 
"keyword_marker", + "kstem", + "length", + "limit", + "lowercase", + "min_hash", + "multiplexer", + "nGram", + "ngram", + "pattern_capture", + "pattern_replace", + "persian_normalization", + "porter_stem", + "predicate_token_filter", + "remove_duplicates", + "reverse", + "russian_stem", + "scandinavian_folding", + "scandinavian_normalization", + "serbian_normalization", + "snowball", + "sorani_normalization", + "stemmer", + "stemmer_override", + "synonym", + "synonym_graph", + "trim", + "truncate", + "unique", + "uppercase", + "word_delimiter", + "word_delimiter_graph" + ], + "charFilters" : [ + "html_strip", + "mapping", + "pattern_replace" + ], + "hunspellDictionaries" : [ ] + } + ] + } + } + } +} +``` +
+ ## Text analysis at indexing time and query time OpenSearch performs text analysis on text fields when you index a document and when you send a search request. Depending on the time of text analysis, the analyzers used for it are classified as follows: @@ -172,4 +483,4 @@ The response provides information about the analyzers for each field: ## Next steps -- Learn more about specifying [index analyzers]({{site.url}}{{site.baseurl}}/analyzers/index-analyzers/) and [search analyzers]({{site.url}}{{site.baseurl}}/analyzers/search-analyzers/). \ No newline at end of file +- Learn more about specifying [index analyzers]({{site.url}}{{site.baseurl}}/analyzers/index-analyzers/) and [search analyzers]({{site.url}}{{site.baseurl}}/analyzers/search-analyzers/). diff --git a/_api-reference/nodes-apis/nodes-info.md b/_api-reference/nodes-apis/nodes-info.md index d7c810410e..9d2a21469d 100644 --- a/_api-reference/nodes-apis/nodes-info.md +++ b/_api-reference/nodes-apis/nodes-info.md @@ -69,6 +69,7 @@ plugins | Information about installed plugins and modules. ingest | Information about ingest pipelines and available ingest processors. aggregations | Information about available [aggregations]({{site.url}}{{site.baseurl}}/opensearch/aggregations). indices | Static index settings configured at the node level. +analysis_components | Information about available [text analysis]({{site.url}}{{site.baseurl}}/analyzers/) components. ## Query parameters @@ -162,6 +163,7 @@ plugins | Information about the installed plugins, including name, version, Open modules | Information about the modules, including name, version, OpenSearch version, Java version, description, class name, custom folder name, a list of extended plugins, and `has_native_controller`, which specifies whether the plugin has a native controller process. Modules are different from plugins because modules are loaded into OpenSearch automatically, while plugins have to be installed manually. 
ingest | Information about ingest pipelines and processors. aggregations | Information about the available aggregation types. +analysis_components | Information about available [text analysis]({{site.url}}{{site.baseurl}}/analyzers/) components. ## Required permissions From 5f1da625a3d16b8bc42aec07a918f5c5f28c3d6e Mon Sep 17 00:00:00 2001 From: Naarcha-AWS <97990722+Naarcha-AWS@users.noreply.github.com> Date: Wed, 24 Apr 2024 11:09:16 -0500 Subject: [PATCH 2/2] Update index.md --- _analyzers/index.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/_analyzers/index.md b/_analyzers/index.md index 860a1559ae..83365af88e 100644 --- a/_analyzers/index.md +++ b/_analyzers/index.md @@ -66,7 +66,7 @@ If needed, you can combine tokenizers, token filters, and character filters to c -With the introduction of OpenSearch `v2.12.1`, you can retrieve a comprehensive list of all available text analysis components by using [Nodes Info]({{site.url}}{{site.baseurl}}/api-reference/nodes-apis/nodes-info/). This can be helpful when building custom analyzers, especially in cases where you need to recall the component's name or identify the analysis plugin to which the component belongs. +With the introduction of OpenSearch `v2.14`, you can retrieve a comprehensive list of all available text analysis components by using [Nodes Info]({{site.url}}{{site.baseurl}}/api-reference/nodes-apis/nodes-info/). This can be helpful when building custom analyzers, especially in cases where you need to recall the component's name or identify the analysis plugin to which the component belongs. -Introduced 2.12.1 +Introduced 2.14 {: .label .label-purple } ```json