From 76d167893a1babf0eef48804996b82ab1898b501 Mon Sep 17 00:00:00 2001 From: djkhl Date: Thu, 26 Sep 2024 10:31:16 +0200 Subject: [PATCH 01/13] add changelog entry and some prototypey things that actually do nothing yet --- CHANGELOG.md | 1 + .../filter/expression/filter_expression.py | 18 ++++++++- logprep/filter/lucene_filter.py | 38 ++++++++++++++----- 3 files changed, 46 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 642c305b0..55dad53b2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ * adds `desired_cluster_status` option to opensearch output to signal healthy cluster status * initially run health checks on setup for every configured component * make `imagePullPolicy` configurable for helm chart deployments +* it is now possible to use Lucene compliant Filter Expressions ### Improvements diff --git a/logprep/filter/expression/filter_expression.py b/logprep/filter/expression/filter_expression.py index 738010374..d3e2bb6c8 100644 --- a/logprep/filter/expression/filter_expression.py +++ b/logprep/filter/expression/filter_expression.py @@ -3,7 +3,7 @@ import re from abc import ABC, abstractmethod from itertools import chain, zip_longest -from typing import List, Any +from typing import Any, List class FilterExpressionError(BaseException): @@ -318,6 +318,22 @@ def does_match(self, document: dict) -> bool: return self._lower_bound <= value <= self._upper_bound +class LuceneRegexExpression(KeyValueBasedFilterExpression): + """Lucene compliant filter expression that matches a value using regex.""" + + def __init__(self, key: List[str], regex: str): + self._regex = regex + self._matcher = re.compile(self._regex) + super().__init__(key, f"/{self._regex.strip('^$')}/") + + def does_match(self, document: dict) -> bool: + value = self._get_value(self.key, document) + + if isinstance(value, list): + return any(filter(self._matcher.match, value)) + return self._matcher.match(str(value)) is not None + + class 
RegExFilterExpression(KeyValueBasedFilterExpression): """Filter expression that matches a value using regex.""" diff --git a/logprep/filter/lucene_filter.py b/logprep/filter/lucene_filter.py index ee24d8d7e..292ed6eb1 100644 --- a/logprep/filter/lucene_filter.py +++ b/logprep/filter/lucene_filter.py @@ -84,23 +84,36 @@ from itertools import chain, zip_longest # pylint: enable=anomalous-backslash-in-string -from typing import List, Union, Optional +from typing import List, Optional, Union import luqum -from luqum.parser import parser, ParseSyntaxError, IllegalCharacterError -from luqum.tree import OrOperation, AndOperation, Group, FieldGroup, SearchField, Phrase, Word, Not +from luqum.parser import IllegalCharacterError, ParseSyntaxError, parser +from luqum.tree import ( + AndOperation, + FieldGroup, + Group, + Not, + OrOperation, + Phrase, + Regex, + SearchField, + Word, +) from logprep.filter.expression.filter_expression import ( - Or, + Always, And, - StringFilterExpression, - SigmaFilterExpression, - RegExFilterExpression, - Not as NotExpression, Exists, - Null, - Always, FilterExpression, + LuceneRegexExpression, +) +from logprep.filter.expression.filter_expression import Not as NotExpression +from logprep.filter.expression.filter_expression import ( + Null, + Or, + RegExFilterExpression, + SigmaFilterExpression, + StringFilterExpression, ) @@ -261,8 +274,13 @@ def _parse_tree(self, tree: luqum.tree) -> FilterExpression: if self._last_search_field: return self._create_field_group_expression(tree) return self._create_value_expression(tree) + if isinstance(tree, Regex): + return self._create_regex_expression(tree) raise LuceneFilterError(f'The expression "{str(tree)}" is invalid!') + def _create_regex_expression(self, tree: luqum.tree) -> LuceneRegexExpression: + pass + def _create_field_group_expression(self, tree: luqum.tree) -> FilterExpression: """Creates filter expression that is resulting from a field group. 
From e149a02d70a9cc85eec5ea50d5e359fd7c0c44da Mon Sep 17 00:00:00 2001 From: FabianMoessner Date: Fri, 27 Sep 2024 15:48:31 +0200 Subject: [PATCH 02/13] Adding lucene compliance unit test for development --- tests/unit/filter/test_lucene_filter.py | 27 +++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/tests/unit/filter/test_lucene_filter.py b/tests/unit/filter/test_lucene_filter.py index d843fb438..a8f593681 100644 --- a/tests/unit/filter/test_lucene_filter.py +++ b/tests/unit/filter/test_lucene_filter.py @@ -451,3 +451,30 @@ def test_create_filter_success(self, testcase, input_str, cleaned_str): def test_create_filter_error(self, testcase, input_str, message): with raises(LuceneFilterError, match=re.escape(message)): LuceneFilter.create(f'foo: "{input_str}"') + + def test_creates_lucene_compliance_filter_two_matching_regex_keys_of_two(self): + lucene_filter = LuceneFilter.create( + 'regex_key_one: "/.*value.*/" AND regex_key_two: "/.*value.*/"', + special_fields={"regex_fields": ["regex_key_one", "regex_key_two"]}, + ) + + a = And( + RegExFilterExpression(["regex_key_one"], ".*value.*"), + RegExFilterExpression(["regex_key_two"], ".*value.*"), + ) + + assert lucene_filter == And( + RegExFilterExpression(["regex_key_one"], ".*value.*"), + RegExFilterExpression(["regex_key_two"], ".*value.*"), + ) + + def test_creates_lucene_compliance_filter_with_one_matching_and_one_missmatching_regex_key_of_two(self): + lucene_filter = LuceneFilter.create( + 'regex_key_one: ".*value.*" AND key_two: "value"', + special_fields={"regex_fields": ["regex_key_one", "i_dont_exist"]}, + ) + + assert lucene_filter == And( + RegExFilterExpression(["regex_key_one"], ".*value.*"), + StringFilterExpression(["key_two"], "value"), + ) From c7388c58bdd33d14e220e69235771706f3b45bbd Mon Sep 17 00:00:00 2001 From: FabianMoessner Date: Wed, 2 Oct 2024 16:14:22 +0200 Subject: [PATCH 03/13] Adding lucene compliance for filter parsing of a rule.
--- logprep/filter/lucene_filter.py | 10 ++++++++++ tests/unit/filter/test_lucene_filter.py | 9 +-------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/logprep/filter/lucene_filter.py b/logprep/filter/lucene_filter.py index 292ed6eb1..3bd6c96c7 100644 --- a/logprep/filter/lucene_filter.py +++ b/logprep/filter/lucene_filter.py @@ -235,8 +235,18 @@ def __init__(self, tree: luqum.tree, special_fields: dict = None): for key in self._special_fields_map: self._special_fields[key] = special_fields.get(key) if special_fields.get(key) else [] + if not self._special_fields['regex_fields']: + self.recognize_regex_and_add_special_fields() + self._last_search_field = None + def recognize_regex_and_add_special_fields(self): + for child in self._tree.children: + value = child.children[0].value[1:-1] + if value.startswith('/') and value.endswith('/'): + self._special_fields['regex_fields'].append(child.name) + child.children[0].value = f'"{value[1:-1]}"' + def build_filter(self) -> FilterExpression: """Transform luqum tree into FilterExpression diff --git a/tests/unit/filter/test_lucene_filter.py b/tests/unit/filter/test_lucene_filter.py index a8f593681..1a790e5d8 100644 --- a/tests/unit/filter/test_lucene_filter.py +++ b/tests/unit/filter/test_lucene_filter.py @@ -455,12 +455,6 @@ def test_create_filter_error(self, testcase, input_str, message): def test_creates_lucene_compliance_filter_two_matching_regex_keys_of_two(self): lucene_filter = LuceneFilter.create( 'regex_key_one: "/.*value.*/" AND regex_key_two: "/.*value.*/"', - special_fields={"regex_fields": ["regex_key_one", "regex_key_two"]}, - ) - - a = And( - RegExFilterExpression(["regex_key_one"], ".*value.*"), - RegExFilterExpression(["regex_key_two"], ".*value.*"), ) assert lucene_filter == And( @@ -470,8 +464,7 @@ def test_creates_lucene_compliance_filter_two_matching_regex_keys_of_two(self): def test_creates_lucene_compliance_filter_with_one_matching_and_one_missmatching_regex_key_of_two(self): 
lucene_filter = LuceneFilter.create( - 'regex_key_one: ".*value.*" AND key_two: "value"', - special_fields={"regex_fields": ["regex_key_one", "i_dont_exist"]}, + 'regex_key_one: "/.*value.*/" AND key_two: "value"', ) assert lucene_filter == And( From 0b644b7d81ee0c1503a0360afa2160189bb2a2ee Mon Sep 17 00:00:00 2001 From: FabianMoessner Date: Mon, 7 Oct 2024 13:10:43 +0200 Subject: [PATCH 04/13] Adding logger with deprecation warning for regex_fields --- logprep/filter/lucene_filter.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/logprep/filter/lucene_filter.py b/logprep/filter/lucene_filter.py index 3bd6c96c7..445f48f16 100644 --- a/logprep/filter/lucene_filter.py +++ b/logprep/filter/lucene_filter.py @@ -86,6 +86,7 @@ # pylint: enable=anomalous-backslash-in-string from typing import List, Optional, Union +import logging import luqum from luqum.parser import IllegalCharacterError, ParseSyntaxError, parser from luqum.tree import ( @@ -116,6 +117,7 @@ StringFilterExpression, ) +logger = logging.getLogger("LuceneFilter") class LuceneFilterError(BaseException): """Base class for LuceneFilter related exceptions.""" @@ -237,6 +239,9 @@ def __init__(self, tree: luqum.tree, special_fields: dict = None): if not self._special_fields['regex_fields']: self.recognize_regex_and_add_special_fields() + else: + logger.warning("[Deprecation]: special_fields are no longer necessary. 
" + "Use Lucene regex annotation for filter ") self._last_search_field = None From 2c244520baceeefce4134cbc705848c5f47b3bfb Mon Sep 17 00:00:00 2001 From: FabianMoessner Date: Mon, 7 Oct 2024 13:27:11 +0200 Subject: [PATCH 05/13] Add comment and documentation for lucene regex filter annotation --- logprep/filter/lucene_filter.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/logprep/filter/lucene_filter.py b/logprep/filter/lucene_filter.py index 445f48f16..fc7363099 100644 --- a/logprep/filter/lucene_filter.py +++ b/logprep/filter/lucene_filter.py @@ -240,12 +240,15 @@ def __init__(self, tree: luqum.tree, special_fields: dict = None): if not self._special_fields['regex_fields']: self.recognize_regex_and_add_special_fields() else: + # DEPRECATION: regex_fields are no longer necessary. logger.warning("[Deprecation]: special_fields are no longer necessary. " "Use Lucene regex annotation for filter ") self._last_search_field = None def recognize_regex_and_add_special_fields(self): + """ Recognize regex expressions in filter and add those fields to regex_fields. + """ for child in self._tree.children: value = child.children[0].value[1:-1] if value.startswith('/') and value.endswith('/'): From cffe57b09bfb58472128e3a009a46b950512c433 Mon Sep 17 00:00:00 2001 From: FabianMoessner Date: Mon, 7 Oct 2024 15:00:15 +0200 Subject: [PATCH 06/13] Quickfix for lucene regex filter --- logprep/filter/lucene_filter.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/logprep/filter/lucene_filter.py b/logprep/filter/lucene_filter.py index fc7363099..144c03410 100644 --- a/logprep/filter/lucene_filter.py +++ b/logprep/filter/lucene_filter.py @@ -250,10 +250,13 @@ def recognize_regex_and_add_special_fields(self): """ Recognize regex expressions in filter and add those fields to regex_fields. 
""" for child in self._tree.children: - value = child.children[0].value[1:-1] - if value.startswith('/') and value.endswith('/'): - self._special_fields['regex_fields'].append(child.name) - child.children[0].value = f'"{value[1:-1]}"' + try: + value = child.children[0].value[1:-1] + if value.startswith('/') and value.endswith('/'): + self._special_fields['regex_fields'].append(child.name) + child.children[0].value = f'"{value[1:-1]}"' + except: + pass def build_filter(self) -> FilterExpression: """Transform luqum tree into FilterExpression From 7a2ca1f6d900316bbfd41103e4c4b70aa02861f7 Mon Sep 17 00:00:00 2001 From: FabianMoessner Date: Mon, 7 Oct 2024 15:46:09 +0200 Subject: [PATCH 07/13] Adjusting Format --- logprep/filter/lucene_filter.py | 12 ++++++------ tests/unit/filter/test_lucene_filter.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/logprep/filter/lucene_filter.py b/logprep/filter/lucene_filter.py index 144c03410..334dbd49d 100644 --- a/logprep/filter/lucene_filter.py +++ b/logprep/filter/lucene_filter.py @@ -237,23 +237,23 @@ def __init__(self, tree: luqum.tree, special_fields: dict = None): for key in self._special_fields_map: self._special_fields[key] = special_fields.get(key) if special_fields.get(key) else [] - if not self._special_fields['regex_fields']: + if not self._special_fields["regex_fields"]: self.recognize_regex_and_add_special_fields() else: # DEPRECATION: regex_fields are no longer necessary. logger.warning("[Deprecation]: special_fields are no longer necessary. " - "Use Lucene regex annotation for filter ") + "Use Lucene regex annotation for filter " + ) self._last_search_field = None def recognize_regex_and_add_special_fields(self): - """ Recognize regex expressions in filter and add those fields to regex_fields. - """ + """ Recognize regex expressions in filter and add those fields to regex_fields. 
""" for child in self._tree.children: try: value = child.children[0].value[1:-1] - if value.startswith('/') and value.endswith('/'): - self._special_fields['regex_fields'].append(child.name) + if value.startswith("/") and value.endswith("/"): + self._special_fields["regex_fields"].append(child.name) child.children[0].value = f'"{value[1:-1]}"' except: pass diff --git a/tests/unit/filter/test_lucene_filter.py b/tests/unit/filter/test_lucene_filter.py index 1a790e5d8..9f74164e4 100644 --- a/tests/unit/filter/test_lucene_filter.py +++ b/tests/unit/filter/test_lucene_filter.py @@ -462,7 +462,7 @@ def test_creates_lucene_compliance_filter_two_matching_regex_keys_of_two(self): RegExFilterExpression(["regex_key_two"], ".*value.*"), ) - def test_creates_lucene_compliance_filter_with_one_matching_and_one_missmatching_regex_key_of_two(self): + def test_creates_lucene_compliance_filter_one_matching_one_missmatch_regex_key_of_two(self): lucene_filter = LuceneFilter.create( 'regex_key_one: "/.*value.*/" AND key_two: "value"', ) From 86cb008cc7384d2fa1ecfb486805b928bacf301f Mon Sep 17 00:00:00 2001 From: FabianMoessner Date: Mon, 7 Oct 2024 16:07:51 +0200 Subject: [PATCH 08/13] Adjusting Format 2 --- logprep/filter/lucene_filter.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/logprep/filter/lucene_filter.py b/logprep/filter/lucene_filter.py index 334dbd49d..fa57ccd8e 100644 --- a/logprep/filter/lucene_filter.py +++ b/logprep/filter/lucene_filter.py @@ -119,6 +119,7 @@ logger = logging.getLogger("LuceneFilter") + class LuceneFilterError(BaseException): """Base class for LuceneFilter related exceptions.""" @@ -241,14 +242,15 @@ def __init__(self, tree: luqum.tree, special_fields: dict = None): self.recognize_regex_and_add_special_fields() else: # DEPRECATION: regex_fields are no longer necessary. - logger.warning("[Deprecation]: special_fields are no longer necessary. 
" - "Use Lucene regex annotation for filter " + logger.warning( + "[Deprecation]: special_fields are no longer necessary. " + "Use Lucene regex annotation for filter. " ) self._last_search_field = None def recognize_regex_and_add_special_fields(self): - """ Recognize regex expressions in filter and add those fields to regex_fields. """ + """Recognize regex expressions in filter and add those fields to regex_fields. """ for child in self._tree.children: try: value = child.children[0].value[1:-1] From ddf350806ba12d9f336faa882f1b71808f3cc7e7 Mon Sep 17 00:00:00 2001 From: FabianMoessner Date: Mon, 7 Oct 2024 16:17:16 +0200 Subject: [PATCH 09/13] Adjusting Format 3 --- logprep/filter/lucene_filter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/logprep/filter/lucene_filter.py b/logprep/filter/lucene_filter.py index fa57ccd8e..63b32e4b3 100644 --- a/logprep/filter/lucene_filter.py +++ b/logprep/filter/lucene_filter.py @@ -244,13 +244,13 @@ def __init__(self, tree: luqum.tree, special_fields: dict = None): # DEPRECATION: regex_fields are no longer necessary. logger.warning( "[Deprecation]: special_fields are no longer necessary. " - "Use Lucene regex annotation for filter. " + "Use Lucene regex annotation for filter. " ) self._last_search_field = None def recognize_regex_and_add_special_fields(self): - """Recognize regex expressions in filter and add those fields to regex_fields. 
""" + """Recognize regex expressions in filter and add those fields to regex_fields.""" for child in self._tree.children: try: value = child.children[0].value[1:-1] From 5a1fe0d10f577d642ed966c72346b130fae31323 Mon Sep 17 00:00:00 2001 From: FabianMoessner Date: Wed, 9 Oct 2024 17:17:49 +0200 Subject: [PATCH 10/13] Attempting to remove indeces for regex filter string --- logprep/filter/lucene_filter.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/logprep/filter/lucene_filter.py b/logprep/filter/lucene_filter.py index 63b32e4b3..3805197c4 100644 --- a/logprep/filter/lucene_filter.py +++ b/logprep/filter/lucene_filter.py @@ -251,13 +251,23 @@ def __init__(self, tree: luqum.tree, special_fields: dict = None): def recognize_regex_and_add_special_fields(self): """Recognize regex expressions in filter and add those fields to regex_fields.""" + # for child in self._tree.children: + # try: + # value = child.children[0].value[1:-1] + # if value.startswith("/") and value.endswith("/"): + # self._special_fields["regex_fields"].append(child.name) + # child.children[0].value = f'"{value[1:-1]}"' + # except: + # pass for child in self._tree.children: try: - value = child.children[0].value[1:-1] - if value.startswith("/") and value.endswith("/"): - self._special_fields["regex_fields"].append(child.name) - child.children[0].value = f'"{value[1:-1]}"' - except: + for sub_child in child.children: + value = getattr(sub_child, 'value')[1:-1] + if value and value.startswith("/") and value.endswith("/"): + self._special_fields["regex_fields"].append(child.name) + sub_child.value = f'"{value.strip("/")}"' + break + except Exception: pass def build_filter(self) -> FilterExpression: From 6ba5b2f3f3a5476eb3cbe5334d16cf1c395371db Mon Sep 17 00:00:00 2001 From: FabianMoessner Date: Wed, 9 Oct 2024 17:18:39 +0200 Subject: [PATCH 11/13] Adding notebook for lucene regex filter development --- .../notebooks/processor_examples/regex.ipynb | 129 
++++++++++++++++++ 1 file changed, 129 insertions(+) create mode 100644 doc/source/development/notebooks/processor_examples/regex.ipynb diff --git a/doc/source/development/notebooks/processor_examples/regex.ipynb b/doc/source/development/notebooks/processor_examples/regex.ipynb new file mode 100644 index 000000000..656a39f13 --- /dev/null +++ b/doc/source/development/notebooks/processor_examples/regex.ipynb @@ -0,0 +1,129 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append(\"../../../../../\")\n", + "import tempfile\n", + "from pathlib import Path\n", + "\n", + "\n", + "document = { \n", + " \"regex_key_one\": \"*1value2*\", \n", + " \"regex_key_two\": \"Here is 1234 in the message\" \n", + "}\n", + "\n", + "\n", + "rule_yaml = \"\"\"\n", + "---\n", + "filter: 'regex_key_one: \"/.*value.*/\" AND regex_key_two: \"/.*1234.*/\"'\n", + "regex_fields:\n", + " - \"regex_key_one\"\n", + " - \"regex_key_two\"\n", + "pseudonymizer:\n", + " mapping:\n", + " winlog.event_data.param1: \"RE_WHOLE_FIELD\"\n", + " winlog.event_data.param2: \"RE_WHOLE_FIELD\"\n", + " description: \"...\"\n", + "\"\"\"\n", + "\n", + "rule_path = Path(tempfile.gettempdir()) / \"concatenator\"\n", + "rule_path.mkdir(exist_ok=True)\n", + "rule_file = rule_path / \"data-stream.yml\"\n", + "rule_file.write_text(rule_yaml)\n", + "\n", + "processor_config = {\n", + " \"mydropper\": {\n", + " \"type\": \"pseudonymizer\",\n", + " \"specific_rules\": [str(rule_path)],\n", + " \"generic_rules\": [\"/dev\"],\n", + " \"outputs\": [{\"kafka\": \"topic\"}],\n", + " \"pubkey_analyst\": \"../../../../../tests/testdata/unit/pseudonymizer/example_analyst_pub.pem\",\n", + " \"pubkey_depseudo\": \"../../../../../tests/testdata/unit/pseudonymizer/example_depseudo_pub.pem\",\n", + " \"hash_salt\": \"a_secret_tasty_ingredient\",\n", + " \"regex_mapping\": 
\"../../../../../tests/testdata/unit/pseudonymizer/rules/regex_mapping.yml\",\n", + " \"max_cached_pseudonyms\": 1000000\n", + " }\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "pseudonymizer" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from unittest import mock\n", + "from logprep.factory import Factory\n", + "\n", + "mock_logger = mock.MagicMock()\n", + "pseudonymizer_processor = Factory.create(processor_config)\n", + "pseudonymizer_processor" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "before: {'regex_key_one': '*1value2*', 'regex_key_two': 'Here is 1234 in the message'}\n", + "after: {'regex_key_one': '*1value2*', 'regex_key_two': 'Here is 1234 in the message'}\n" + ] + } + ], + "source": [ + "from copy import deepcopy\n", + "mydocument = deepcopy(document)\n", + "\n", + "print(f\"before: {mydocument}\")\n", + "pseudonymizer_processor.process(mydocument)\n", + "print(f\"after: {mydocument}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + }, + "vscode": { + "interpreter": { + "hash": "586280540a85d3e21edc698fe7b86af2848b9b02644e6c22463da25c40a3f1be" + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From bea645c95282a136f176377eb6b97ef58885ca13 Mon Sep 17 00:00:00 2001 From: FabianMoessner Date: Thu, 10 Oct 2024 13:53:08 +0200 Subject: [PATCH 12/13] WIP notebook for lucene regex filter development --- .../notebooks/processor_examples/regex.ipynb | 
66 +++++++++++++++---- 1 file changed, 54 insertions(+), 12 deletions(-) diff --git a/doc/source/development/notebooks/processor_examples/regex.ipynb b/doc/source/development/notebooks/processor_examples/regex.ipynb index 656a39f13..23ba05917 100644 --- a/doc/source/development/notebooks/processor_examples/regex.ipynb +++ b/doc/source/development/notebooks/processor_examples/regex.ipynb @@ -13,34 +13,54 @@ "\n", "\n", "document = { \n", - " \"regex_key_one\": \"*1value2*\", \n", - " \"regex_key_two\": \"Here is 1234 in the message\" \n", + " \"regex_key_one\": \"value\", \n", + " \"regex_key_two\": \"*1value2*\" \n", "}\n", "\n", - "\n", "rule_yaml = \"\"\"\n", "---\n", - "filter: 'regex_key_one: \"/.*value.*/\" AND regex_key_two: \"/.*1234.*/\"'\n", + "filter: 'regex_key_one: \"/.*value.*/\" AND regex_key_two: \"/.*value.*/\"'\n", "regex_fields:\n", " - \"regex_key_one\"\n", " - \"regex_key_two\"\n", "pseudonymizer:\n", " mapping:\n", - " winlog.event_data.param1: \"RE_WHOLE_FIELD\"\n", - " winlog.event_data.param2: \"RE_WHOLE_FIELD\"\n", + " regex_key_one: \"RE_WHOLE_FIELD\"\n", + " regex_key_two: \"RE_WHOLE_FIELD\"\n", " description: \"...\"\n", "\"\"\"\n", "\n", - "rule_path = Path(tempfile.gettempdir()) / \"concatenator\"\n", + "\n", + "\n", + "document = { \n", + " \"regex_key_one\": \"*value*\", \n", + " \"regex_key_two\": \"value\", \n", + " \"test_pseudonymizer\": \"test\", \"something_special\": \"pseudonymize_me\"\n", + "}\n", + "\n", + "rule_yaml = \"\"\"\n", + "---\n", + "filter: \"test_pseudonymizer AND something_special\"\n", + "pseudonymizer:\n", + " id: pseudonymizer-1352bc0a-53ae-4740-bb9e-1e865f63375f\n", + " mapping:\n", + " something_special: \"RE_WHOLE_FIELD\"\n", + "description: \"...\"\n", + "\"\"\"\n", + "\n", + "\n", + "\n", + "\n", + "rule_path = Path(tempfile.gettempdir()) / \"pseudonymizer\"\n", "rule_path.mkdir(exist_ok=True)\n", "rule_file = rule_path / \"data-stream.yml\"\n", "rule_file.write_text(rule_yaml)\n", "\n", 
"processor_config = {\n", - " \"mydropper\": {\n", + " \"mypseudonymizer\": {\n", " \"type\": \"pseudonymizer\",\n", " \"specific_rules\": [str(rule_path)],\n", - " \"generic_rules\": [\"/dev\"],\n", + " \"generic_rules\": [\"../../../../../examples/exampledata/rules/pseudonymizer/generic/\"],\n", " \"outputs\": [{\"kafka\": \"topic\"}],\n", " \"pubkey_analyst\": \"../../../../../tests/testdata/unit/pseudonymizer/example_analyst_pub.pem\",\n", " \"pubkey_depseudo\": \"../../../../../tests/testdata/unit/pseudonymizer/example_depseudo_pub.pem\",\n", @@ -85,8 +105,30 @@ "name": "stdout", "output_type": "stream", "text": [ - "before: {'regex_key_one': '*1value2*', 'regex_key_two': 'Here is 1234 in the message'}\n", - "after: {'regex_key_one': '*1value2*', 'regex_key_two': 'Here is 1234 in the message'}\n" + "before: {'regex_key_one': '*value*', 'regex_key_two': 'value', 'test_pseudonymizer': 'test', 'something_special': 'pseudonymize_me'}\n" + ] + }, + { + "ename": "ProcessingCriticalError", + "evalue": "ProcessingCriticalError: ProcessingCriticalError: 'str' object has no attribute 'groups' -> event was send to error output and further processing stopped, rule.id='pseudonymizer-1352bc0a-53ae-4740-bb9e-1e865f63375f', rule.description='...', event={'regex_key_one': '*value*', 'regex_key_two': 'value', 'test_pseudonymizer': 'test', 'something_special': 'pseudonymize_me'}", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m~/arbeit/venvs/logprep/lib/python3.12/site-packages/logprep/abc/processor.py:194\u001b[0m, in \u001b[0;36mProcessor._apply_rules_wrapper\u001b[0;34m(self, event, rule)\u001b[0m\n\u001b[1;32m 193\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 194\u001b[0m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_apply_rules\u001b[49m\u001b[43m(\u001b[49m\u001b[43mevent\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrule\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 195\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m ProcessingWarning \u001b[38;5;28;01mas\u001b[39;00m error:\n", + "File \u001b[0;32m~/arbeit/venvs/logprep/lib/python3.12/site-packages/logprep/processor/pseudonymizer/processor.py:261\u001b[0m, in \u001b[0;36mPseudonymizer._apply_rules\u001b[0;34m(self, event, rule)\u001b[0m\n\u001b[1;32m 260\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 261\u001b[0m field_value \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_pseudonymize_field\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrule\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdotted_field\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mregex\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfield_value\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 262\u001b[0m _ \u001b[38;5;241m=\u001b[39m add_field_to(event, dotted_field, field_value, overwrite_output_field\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n", + "File \u001b[0;32m~/arbeit/venvs/logprep/lib/python3.12/site-packages/logprep/processor/pseudonymizer/processor.py:271\u001b[0m, in \u001b[0;36mPseudonymizer._pseudonymize_field\u001b[0;34m(self, rule, dotted_field, regex, field_value)\u001b[0m\n\u001b[1;32m 268\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_pseudonymize_field\u001b[39m(\n\u001b[1;32m 269\u001b[0m \u001b[38;5;28mself\u001b[39m, rule: PseudonymizerRule, dotted_field: \u001b[38;5;28mstr\u001b[39m, regex: Pattern, field_value: \u001b[38;5;28mstr\u001b[39m\n\u001b[1;32m 270\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mstr\u001b[39m:\n\u001b[0;32m--> 271\u001b[0m 
\u001b[38;5;28;01mif\u001b[39;00m \u001b[43mregex\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgroups\u001b[49m \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 272\u001b[0m plaintext_values \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m(value \u001b[38;5;28;01mfor\u001b[39;00m value \u001b[38;5;129;01min\u001b[39;00m regex\u001b[38;5;241m.\u001b[39mfindall(field_value) \u001b[38;5;28;01mif\u001b[39;00m value)\n", + "\u001b[0;31mAttributeError\u001b[0m: 'str' object has no attribute 'groups'", + "\nThe above exception was the direct cause of the following exception:\n", + "\u001b[0;31mProcessingCriticalError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[3], line 5\u001b[0m\n\u001b[1;32m 2\u001b[0m mydocument \u001b[38;5;241m=\u001b[39m deepcopy(document)\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbefore: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmydocument\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m----> 5\u001b[0m \u001b[43mpseudonymizer_processor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mprocess\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmydocument\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mafter : \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmydocument\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m~/arbeit/venvs/logprep/lib/python3.12/site-packages/logprep/abc/processor.py:161\u001b[0m, in \u001b[0;36mProcessor.process\u001b[0;34m(self, event)\u001b[0m\n\u001b[1;32m 159\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_extra_data\u001b[38;5;241m.\u001b[39mclear()\n\u001b[1;32m 160\u001b[0m 
logger\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdescribe()\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m processing event \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mevent\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 161\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_process_rule_tree\u001b[49m\u001b[43m(\u001b[49m\u001b[43mevent\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_specific_tree\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 162\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_process_rule_tree(event, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_generic_tree)\n\u001b[1;32m 163\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_extra_data \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_extra_data \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n", + "File \u001b[0;32m~/arbeit/venvs/logprep/lib/python3.12/site-packages/logprep/abc/processor.py:190\u001b[0m, in \u001b[0;36mProcessor._process_rule_tree\u001b[0;34m(self, event, tree)\u001b[0m\n\u001b[1;32m 188\u001b[0m _process_rule_tree_multiple_times(tree, event)\n\u001b[1;32m 189\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 190\u001b[0m \u001b[43m_process_rule_tree_once\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtree\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mevent\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/arbeit/venvs/logprep/lib/python3.12/site-packages/logprep/abc/processor.py:185\u001b[0m, in \u001b[0;36mProcessor._process_rule_tree.._process_rule_tree_once\u001b[0;34m(tree, event)\u001b[0m\n\u001b[1;32m 183\u001b[0m 
matching_rules \u001b[38;5;241m=\u001b[39m tree\u001b[38;5;241m.\u001b[39mget_matching_rules(event)\n\u001b[1;32m 184\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m rule \u001b[38;5;129;01min\u001b[39;00m matching_rules:\n\u001b[0;32m--> 185\u001b[0m \u001b[43m_process_rule\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrule\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mevent\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/arbeit/venvs/logprep/lib/python3.12/site-packages/logprep/metrics/metrics.py:207\u001b[0m, in \u001b[0;36mMetric.measure_time..without_append..inner\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 205\u001b[0m metric \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mgetattr\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmetrics, metric_name)\n\u001b[1;32m 206\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m metric\u001b[38;5;241m.\u001b[39mtracker\u001b[38;5;241m.\u001b[39mlabels(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mmetric\u001b[38;5;241m.\u001b[39mlabels)\u001b[38;5;241m.\u001b[39mtime():\n\u001b[0;32m--> 207\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 208\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m result\n", + "File \u001b[0;32m~/arbeit/venvs/logprep/lib/python3.12/site-packages/logprep/abc/processor.py:170\u001b[0m, in \u001b[0;36mProcessor._process_rule_tree.._process_rule\u001b[0;34m(rule, event)\u001b[0m\n\u001b[1;32m 168\u001b[0m \u001b[38;5;129m@Metric\u001b[39m\u001b[38;5;241m.\u001b[39mmeasure_time()\n\u001b[1;32m 169\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_process_rule\u001b[39m(rule, event):\n\u001b[0;32m--> 170\u001b[0m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_apply_rules_wrapper\u001b[49m\u001b[43m(\u001b[49m\u001b[43mevent\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrule\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 171\u001b[0m rule\u001b[38;5;241m.\u001b[39mmetrics\u001b[38;5;241m.\u001b[39mnumber_of_processed_events \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 172\u001b[0m applied_rules\u001b[38;5;241m.\u001b[39madd(rule)\n", + "File \u001b[0;32m~/arbeit/venvs/logprep/lib/python3.12/site-packages/logprep/abc/processor.py:200\u001b[0m, in \u001b[0;36mProcessor._apply_rules_wrapper\u001b[0;34m(self, event, rule)\u001b[0m\n\u001b[1;32m 198\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m error \u001b[38;5;66;03m# is needed to prevent wrapping it in itself\u001b[39;00m\n\u001b[1;32m 199\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mBaseException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m error:\n\u001b[0;32m--> 200\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m ProcessingCriticalError(\u001b[38;5;28mstr\u001b[39m(error), rule, event) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merror\u001b[39;00m\n\u001b[1;32m 201\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(rule, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdelete_source_fields\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 202\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n", + "\u001b[0;31mProcessingCriticalError\u001b[0m: ProcessingCriticalError: ProcessingCriticalError: 'str' object has no attribute 'groups' -> event was send to error output and further processing stopped, rule.id='pseudonymizer-1352bc0a-53ae-4740-bb9e-1e865f63375f', rule.description='...', event={'regex_key_one': '*value*', 'regex_key_two': 'value', 'test_pseudonymizer': 'test', 'something_special': 'pseudonymize_me'}" ] } ], @@ -96,7 +138,7 @@ "\n", 
"print(f\"before: {mydocument}\")\n", "pseudonymizer_processor.process(mydocument)\n", - "print(f\"after: {mydocument}\")" + "print(f\"after : {mydocument}\")" ] } ], From e3a6196f332ca42529dc30a1f3f16d37262a08ad Mon Sep 17 00:00:00 2001 From: "MoessnerFabian(Group)" Date: Tue, 22 Oct 2024 15:54:46 +0200 Subject: [PATCH 13/13] Adding Notebook for lucene regex filter testing. --- .../notebooks/processor_examples/regex.ipynb | 260 ++++++++++++------ 1 file changed, 179 insertions(+), 81 deletions(-) diff --git a/doc/source/development/notebooks/processor_examples/regex.ipynb b/doc/source/development/notebooks/processor_examples/regex.ipynb index 23ba05917..62252f2db 100644 --- a/doc/source/development/notebooks/processor_examples/regex.ipynb +++ b/doc/source/development/notebooks/processor_examples/regex.ipynb @@ -1,88 +1,191 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Concatenator\n", + "\n", + "This presentations goal it to introduce the features of the `Concatenator` and how to configure it." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### The challenge\n", + "\n", + "I want to merge different fields from an event in one target field." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "from this:" + ] + }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], + "source": [ + "document = {\n", + " 'data_stream': {\n", + " 'dataset': 'windows', \n", + " 'namespace': 'devopslab', \n", + " 'type': 'logs'\n", + " }, \n", + " '_op_type': 'create'\n", + " }" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "to this:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "expected = {\n", + " 'data_stream': {\n", + " 'dataset': 'windows', \n", + " 'namespace': 'devopslab', \n", + " 'type': 'logs'\n", + " }, \n", + " '_op_type': 'create', \n", + " '_index': 'logs-windows-devopslab'\n", + " }" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create rule and processor" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "create the rule:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "250" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import sys\n", - "sys.path.append(\"../../../../../\")\n", + "sys.path.insert(0,\"../../../../../\")\n", "import tempfile\n", "from pathlib import Path\n", "\n", + "#filter: 'ip_address: \"192\\.168\\.0\\..*\"'\n", "\n", - "document = { \n", - " \"regex_key_one\": \"value\", \n", - " \"regex_key_two\": \"*1value2*\" \n", - "}\n", - "\n", - "rule_yaml = \"\"\"\n", - "---\n", - "filter: 'regex_key_one: \"/.*value.*/\" AND regex_key_two: \"/.*value.*/\"'\n", + "rule_yaml = \"\"\"---\n", + "filter: 'data_stream.type: \".*logs.*\"' \n", "regex_fields:\n", - " - \"regex_key_one\"\n", - " - \"regex_key_two\"\n", - "pseudonymizer:\n", - " mapping:\n", - " regex_key_one: \"RE_WHOLE_FIELD\"\n", - " regex_key_two: \"RE_WHOLE_FIELD\"\n", - " description: \"...\"\n", + " 
- \"data_stream.type\"\n", + "concatenator:\n", + " source_fields:\n", + " - data_stream.type\n", + " - data_stream.dataset\n", + " - data_stream.namespace\n", + " target_field: _index\n", + " separator: \"-\"\n", + " overwrite_target: false\n", + " delete_source_fields: false\n", "\"\"\"\n", "\n", - "\n", - "\n", - "document = { \n", - " \"regex_key_one\": \"*value*\", \n", - " \"regex_key_two\": \"value\", \n", - " \"test_pseudonymizer\": \"test\", \"something_special\": \"pseudonymize_me\"\n", - "}\n", - "\n", - "rule_yaml = \"\"\"\n", - "---\n", - "filter: \"test_pseudonymizer AND something_special\"\n", - "pseudonymizer:\n", - " id: pseudonymizer-1352bc0a-53ae-4740-bb9e-1e865f63375f\n", - " mapping:\n", - " something_special: \"RE_WHOLE_FIELD\"\n", - "description: \"...\"\n", + "rule_yaml2 = \"\"\"---\n", + "filter: 'data_stream.type: \"/logs/\"' \n", + "concatenator:\n", + " source_fields:\n", + " - data_stream.type\n", + " - data_stream.dataset\n", + " - data_stream.namespace\n", + " target_field: _index\n", + " separator: \"-\"\n", + " overwrite_target: false\n", + " delete_source_fields: false\n", "\"\"\"\n", "\n", "\n", - "\n", - "\n", - "rule_path = Path(tempfile.gettempdir()) / \"pseudonymizer\"\n", + "rule_path = Path(tempfile.gettempdir()) / \"concatenator\"\n", "rule_path.mkdir(exist_ok=True)\n", "rule_file = rule_path / \"data-stream.yml\"\n", - "rule_file.write_text(rule_yaml)\n", - "\n", + "rule_file.write_text(rule_yaml2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "create the processor config:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ "processor_config = {\n", - " \"mypseudonymizer\": {\n", - " \"type\": \"pseudonymizer\",\n", + " \"myconcatenator\":{ \n", + " \"type\": \"concatenator\",\n", " \"specific_rules\": [str(rule_path)],\n", - " \"generic_rules\": [\"../../../../../examples/exampledata/rules/pseudonymizer/generic/\"],\n", - " \"outputs\": 
[{\"kafka\": \"topic\"}],\n", - " \"pubkey_analyst\": \"../../../../../tests/testdata/unit/pseudonymizer/example_analyst_pub.pem\",\n", - " \"pubkey_depseudo\": \"../../../../../tests/testdata/unit/pseudonymizer/example_depseudo_pub.pem\",\n", - " \"hash_salt\": \"a_secret_tasty_ingredient\",\n", - " \"regex_mapping\": \"../../../../../tests/testdata/unit/pseudonymizer/rules/regex_mapping.yml\",\n", - " \"max_cached_pseudonyms\": 1000000\n", - " }\n", - "}" + " \"generic_rules\": [\"/dev\"],\n", + " }\n", + " }" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "create the processor with the factory:" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 5, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "hallo\n" + ] + }, { "data": { "text/plain": [ - "pseudonymizer" + "concatenator" ] }, - "execution_count": 2, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -92,43 +195,29 @@ "from logprep.factory import Factory\n", "\n", "mock_logger = mock.MagicMock()\n", - "pseudonymizer_processor = Factory.create(processor_config)\n", - "pseudonymizer_processor" + "concatenator = Factory.create(processor_config)\n", + "concatenator" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Process event" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "before: {'regex_key_one': '*value*', 'regex_key_two': 'value', 'test_pseudonymizer': 'test', 'something_special': 'pseudonymize_me'}\n" - ] - }, - { - "ename": "ProcessingCriticalError", - "evalue": "ProcessingCriticalError: ProcessingCriticalError: 'str' object has no attribute 'groups' -> event was send to error output and further processing stopped, rule.id='pseudonymizer-1352bc0a-53ae-4740-bb9e-1e865f63375f', rule.description='...', event={'regex_key_one': '*value*', 
'regex_key_two': 'value', 'test_pseudonymizer': 'test', 'something_special': 'pseudonymize_me'}", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", - "File \u001b[0;32m~/arbeit/venvs/logprep/lib/python3.12/site-packages/logprep/abc/processor.py:194\u001b[0m, in \u001b[0;36mProcessor._apply_rules_wrapper\u001b[0;34m(self, event, rule)\u001b[0m\n\u001b[1;32m 193\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 194\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_apply_rules\u001b[49m\u001b[43m(\u001b[49m\u001b[43mevent\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrule\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 195\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m ProcessingWarning \u001b[38;5;28;01mas\u001b[39;00m error:\n", - "File \u001b[0;32m~/arbeit/venvs/logprep/lib/python3.12/site-packages/logprep/processor/pseudonymizer/processor.py:261\u001b[0m, in \u001b[0;36mPseudonymizer._apply_rules\u001b[0;34m(self, event, rule)\u001b[0m\n\u001b[1;32m 260\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 261\u001b[0m field_value \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_pseudonymize_field\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrule\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdotted_field\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mregex\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfield_value\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 262\u001b[0m _ \u001b[38;5;241m=\u001b[39m add_field_to(event, dotted_field, field_value, overwrite_output_field\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n", - "File 
\u001b[0;32m~/arbeit/venvs/logprep/lib/python3.12/site-packages/logprep/processor/pseudonymizer/processor.py:271\u001b[0m, in \u001b[0;36mPseudonymizer._pseudonymize_field\u001b[0;34m(self, rule, dotted_field, regex, field_value)\u001b[0m\n\u001b[1;32m 268\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_pseudonymize_field\u001b[39m(\n\u001b[1;32m 269\u001b[0m \u001b[38;5;28mself\u001b[39m, rule: PseudonymizerRule, dotted_field: \u001b[38;5;28mstr\u001b[39m, regex: Pattern, field_value: \u001b[38;5;28mstr\u001b[39m\n\u001b[1;32m 270\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mstr\u001b[39m:\n\u001b[0;32m--> 271\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[43mregex\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgroups\u001b[49m \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 272\u001b[0m plaintext_values \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m(value \u001b[38;5;28;01mfor\u001b[39;00m value \u001b[38;5;129;01min\u001b[39;00m regex\u001b[38;5;241m.\u001b[39mfindall(field_value) \u001b[38;5;28;01mif\u001b[39;00m value)\n", - "\u001b[0;31mAttributeError\u001b[0m: 'str' object has no attribute 'groups'", - "\nThe above exception was the direct cause of the following exception:\n", - "\u001b[0;31mProcessingCriticalError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[3], line 5\u001b[0m\n\u001b[1;32m 2\u001b[0m mydocument \u001b[38;5;241m=\u001b[39m deepcopy(document)\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbefore: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmydocument\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m----> 5\u001b[0m \u001b[43mpseudonymizer_processor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mprocess\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmydocument\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 
6\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mafter : \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmydocument\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n", - "File \u001b[0;32m~/arbeit/venvs/logprep/lib/python3.12/site-packages/logprep/abc/processor.py:161\u001b[0m, in \u001b[0;36mProcessor.process\u001b[0;34m(self, event)\u001b[0m\n\u001b[1;32m 159\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_extra_data\u001b[38;5;241m.\u001b[39mclear()\n\u001b[1;32m 160\u001b[0m logger\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdescribe()\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m processing event \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mevent\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 161\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_process_rule_tree\u001b[49m\u001b[43m(\u001b[49m\u001b[43mevent\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_specific_tree\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 162\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_process_rule_tree(event, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_generic_tree)\n\u001b[1;32m 163\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_extra_data \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_extra_data \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n", - "File \u001b[0;32m~/arbeit/venvs/logprep/lib/python3.12/site-packages/logprep/abc/processor.py:190\u001b[0m, in \u001b[0;36mProcessor._process_rule_tree\u001b[0;34m(self, event, tree)\u001b[0m\n\u001b[1;32m 
188\u001b[0m _process_rule_tree_multiple_times(tree, event)\n\u001b[1;32m 189\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 190\u001b[0m \u001b[43m_process_rule_tree_once\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtree\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mevent\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/arbeit/venvs/logprep/lib/python3.12/site-packages/logprep/abc/processor.py:185\u001b[0m, in \u001b[0;36mProcessor._process_rule_tree.._process_rule_tree_once\u001b[0;34m(tree, event)\u001b[0m\n\u001b[1;32m 183\u001b[0m matching_rules \u001b[38;5;241m=\u001b[39m tree\u001b[38;5;241m.\u001b[39mget_matching_rules(event)\n\u001b[1;32m 184\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m rule \u001b[38;5;129;01min\u001b[39;00m matching_rules:\n\u001b[0;32m--> 185\u001b[0m \u001b[43m_process_rule\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrule\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mevent\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/arbeit/venvs/logprep/lib/python3.12/site-packages/logprep/metrics/metrics.py:207\u001b[0m, in \u001b[0;36mMetric.measure_time..without_append..inner\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 205\u001b[0m metric \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mgetattr\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmetrics, metric_name)\n\u001b[1;32m 206\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m metric\u001b[38;5;241m.\u001b[39mtracker\u001b[38;5;241m.\u001b[39mlabels(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mmetric\u001b[38;5;241m.\u001b[39mlabels)\u001b[38;5;241m.\u001b[39mtime():\n\u001b[0;32m--> 207\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 208\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m result\n", - "File \u001b[0;32m~/arbeit/venvs/logprep/lib/python3.12/site-packages/logprep/abc/processor.py:170\u001b[0m, in \u001b[0;36mProcessor._process_rule_tree.._process_rule\u001b[0;34m(rule, event)\u001b[0m\n\u001b[1;32m 168\u001b[0m \u001b[38;5;129m@Metric\u001b[39m\u001b[38;5;241m.\u001b[39mmeasure_time()\n\u001b[1;32m 169\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_process_rule\u001b[39m(rule, event):\n\u001b[0;32m--> 170\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_apply_rules_wrapper\u001b[49m\u001b[43m(\u001b[49m\u001b[43mevent\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrule\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 171\u001b[0m rule\u001b[38;5;241m.\u001b[39mmetrics\u001b[38;5;241m.\u001b[39mnumber_of_processed_events \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 172\u001b[0m applied_rules\u001b[38;5;241m.\u001b[39madd(rule)\n", - "File \u001b[0;32m~/arbeit/venvs/logprep/lib/python3.12/site-packages/logprep/abc/processor.py:200\u001b[0m, in \u001b[0;36mProcessor._apply_rules_wrapper\u001b[0;34m(self, event, rule)\u001b[0m\n\u001b[1;32m 198\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m error \u001b[38;5;66;03m# is needed to prevent wrapping it in itself\u001b[39;00m\n\u001b[1;32m 199\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mBaseException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m error:\n\u001b[0;32m--> 200\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m ProcessingCriticalError(\u001b[38;5;28mstr\u001b[39m(error), rule, event) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merror\u001b[39;00m\n\u001b[1;32m 201\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m 
\u001b[38;5;28mhasattr\u001b[39m(rule, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdelete_source_fields\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 202\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n", - "\u001b[0;31mProcessingCriticalError\u001b[0m: ProcessingCriticalError: ProcessingCriticalError: 'str' object has no attribute 'groups' -> event was send to error output and further processing stopped, rule.id='pseudonymizer-1352bc0a-53ae-4740-bb9e-1e865f63375f', rule.description='...', event={'regex_key_one': '*value*', 'regex_key_two': 'value', 'test_pseudonymizer': 'test', 'something_special': 'pseudonymize_me'}" + "before: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': 'logs'}, '_op_type': 'create'}\n", + "after: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': 'logs'}, '_op_type': 'create'}\n", + "False\n" ] } ], @@ -136,10 +225,19 @@ "from copy import deepcopy\n", "mydocument = deepcopy(document)\n", "\n", + "\n", "print(f\"before: {mydocument}\")\n", - "pseudonymizer_processor.process(mydocument)\n", - "print(f\"after : {mydocument}\")" + "concatenator.process(mydocument)\n", + "print(f\"after: {mydocument}\")\n", + "print(mydocument == expected)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -158,7 +256,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.5" + "version": "3.12.3" }, "vscode": { "interpreter": {