Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Lucene compliant regex filter expression #675

Draft
wants to merge 13 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
* adds `desired_cluster_status` option to opensearch output to signal healthy cluster status
* initially run health checks on setup for every configured component
* make `imagePullPolicy` configurable for helm chart deployments
* it is now possible to use Lucene-compliant regex filter expressions


### Improvements
Expand Down
269 changes: 269 additions & 0 deletions doc/source/development/notebooks/processor_examples/regex.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,269 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Concatenator\n",
"\n",
"This presentation's goal is to introduce the features of the `Concatenator` and how to configure it."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### The challenge\n",
"\n",
"I want to merge different fields from an event into one target field."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"from this:"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"document = {\n",
" 'data_stream': {\n",
" 'dataset': 'windows', \n",
" 'namespace': 'devopslab', \n",
" 'type': 'logs'\n",
" }, \n",
" '_op_type': 'create'\n",
" }"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"to this:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"expected = {\n",
" 'data_stream': {\n",
" 'dataset': 'windows', \n",
" 'namespace': 'devopslab', \n",
" 'type': 'logs'\n",
" }, \n",
" '_op_type': 'create', \n",
" '_index': 'logs-windows-devopslab'\n",
" }"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create rule and processor"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"create the rule:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"250"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import sys\n",
"sys.path.insert(0,\"../../../../../\")\n",
"import tempfile\n",
"from pathlib import Path\n",
"\n",
"#filter: 'ip_address: \"192\\.168\\.0\\..*\"'\n",
"\n",
"rule_yaml = \"\"\"---\n",
"filter: 'data_stream.type: \".*logs.*\"' \n",
"regex_fields:\n",
" - \"data_stream.type\"\n",
"concatenator:\n",
" source_fields:\n",
" - data_stream.type\n",
" - data_stream.dataset\n",
" - data_stream.namespace\n",
" target_field: _index\n",
" separator: \"-\"\n",
" overwrite_target: false\n",
" delete_source_fields: false\n",
"\"\"\"\n",
"\n",
"rule_yaml2 = \"\"\"---\n",
"filter: 'data_stream.type: \"/logs/\"' \n",
"concatenator:\n",
" source_fields:\n",
" - data_stream.type\n",
" - data_stream.dataset\n",
" - data_stream.namespace\n",
" target_field: _index\n",
" separator: \"-\"\n",
" overwrite_target: false\n",
" delete_source_fields: false\n",
"\"\"\"\n",
"\n",
"\n",
"rule_path = Path(tempfile.gettempdir()) / \"concatenator\"\n",
"rule_path.mkdir(exist_ok=True)\n",
"rule_file = rule_path / \"data-stream.yml\"\n",
"rule_file.write_text(rule_yaml2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"create the processor config:"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"processor_config = {\n",
" \"myconcatenator\":{ \n",
" \"type\": \"concatenator\",\n",
" \"specific_rules\": [str(rule_path)],\n",
" \"generic_rules\": [\"/dev\"],\n",
" }\n",
" }"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"create the processor with the factory:"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"hallo\n"
]
},
{
"data": {
"text/plain": [
"concatenator"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from unittest import mock\n",
"from logprep.factory import Factory\n",
"\n",
"mock_logger = mock.MagicMock()\n",
"concatenator = Factory.create(processor_config)\n",
"concatenator"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Process event"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"before: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': 'logs'}, '_op_type': 'create'}\n",
"after: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': 'logs'}, '_op_type': 'create'}\n",
"False\n"
]
}
],
"source": [
"from copy import deepcopy\n",
"mydocument = deepcopy(document)\n",
"\n",
"\n",
"print(f\"before: {mydocument}\")\n",
"concatenator.process(mydocument)\n",
"print(f\"after: {mydocument}\")\n",
"print(mydocument == expected)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
},
"vscode": {
"interpreter": {
"hash": "586280540a85d3e21edc698fe7b86af2848b9b02644e6c22463da25c40a3f1be"
}
}
},
"nbformat": 4,
"nbformat_minor": 4
}
18 changes: 17 additions & 1 deletion logprep/filter/expression/filter_expression.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import re
from abc import ABC, abstractmethod
from itertools import chain, zip_longest
from typing import List, Any
from typing import Any, List


class FilterExpressionError(BaseException):
Expand Down Expand Up @@ -318,6 +318,22 @@ def does_match(self, document: dict) -> bool:
return self._lower_bound <= value <= self._upper_bound


class LuceneRegexExpression(KeyValueBasedFilterExpression):
    """Lucene compliant filter expression that matches a value using regex.

    The pattern is compiled once on construction and applied with
    ``re.match``, so it is anchored at the beginning of the value only.
    """

    def __init__(self, key: List[str], regex: str):
        """Compile ``regex`` and initialize the key/value base expression.

        Parameters
        ----------
        key : List[str]
            Key that is forwarded to the base class and used by
            ``does_match`` to look up the value in a document.
        regex : str
            Regular expression in Python ``re`` syntax.

        Raises
        ------
        re.error
            If ``regex`` is not a valid regular expression.
        """
        self._regex = regex
        self._matcher = re.compile(self._regex)
        # The base class gets the pattern in slash notation; any leading or
        # trailing '^' / '$' characters are stripped from that representation.
        super().__init__(key, f"/{self._regex.strip('^$')}/")

    def does_match(self, document: dict) -> bool:
        """Return whether the value found under ``self.key`` matches the regex.

        A list value matches if at least one of its elements matches. Every
        candidate (scalar or list element) is converted with ``str`` before
        matching, so non-string values do not raise ``TypeError``.

        Parameters
        ----------
        document : dict
            Document from which the value is read via ``self._get_value``.

        Returns
        -------
        bool
            True if the value (or any list element) matches, else False.
        """
        value = self._get_value(self.key, document)
        match = self._matcher.match

        if isinstance(value, list):
            # Test the match results, not the matched items: an item such as
            # "" is falsy and would be reported as "no match" even though the
            # pattern matched it.
            return any(match(str(item)) is not None for item in value)
        return match(str(value)) is not None


class RegExFilterExpression(KeyValueBasedFilterExpression):
"""Filter expression that matches a value using regex."""

Expand Down
Loading
Loading