From 030521cecc340087e343cccc369dc0fabacc69df Mon Sep 17 00:00:00 2001 From: Frank Steggink Date: Thu, 8 Feb 2018 18:03:00 +0100 Subject: [PATCH 1/8] Make deletion of extracted files optional in ZipFileExtractor --- stetl/filters/zipfileextractor.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/stetl/filters/zipfileextractor.py b/stetl/filters/zipfileextractor.py index 680640b..a614e71 100644 --- a/stetl/filters/zipfileextractor.py +++ b/stetl/filters/zipfileextractor.py @@ -28,6 +28,13 @@ def file_path(self): File name to write the extracted file to. """ pass + + @Config(ptype=bool, default=True, required=False) + def delete_file(self): + """ + Delete the file when the chain has been completed? + """ + pass # End attribute config meta @@ -58,7 +65,7 @@ def invoke(self, packet): def after_chain_invoke(self, packet): import os.path - if os.path.isfile(self.cur_file_path): + if os.path.isfile(self.cur_file_path) and self.delete_file: os.remove(self.cur_file_path) return True From c6f5e5a9693e69ca67d6427bb5f50c00592bfa90 Mon Sep 17 00:00:00 2001 From: Frank Steggink Date: Thu, 8 Feb 2018 18:04:06 +0100 Subject: [PATCH 2/8] Added option to use safe_substitute instead of substitute in StringTemplatingFilter --- stetl/filters/templatingfilter.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/stetl/filters/templatingfilter.py b/stetl/filters/templatingfilter.py index 6fbedfc..e506a2c 100644 --- a/stetl/filters/templatingfilter.py +++ b/stetl/filters/templatingfilter.py @@ -94,6 +94,13 @@ class StringTemplatingFilter(TemplatingFilter): consumes=FORMAT.record or FORMAT.record_array, produces=FORMAT.string """ + @Config(ptype=bool, default=False, required=False) + def safe_substitution(self): + """ + Apply safe substitution? 
+ """ + pass + def __init__(self, configdict, section): TemplatingFilter.__init__(self, configdict, section, consumes=[FORMAT.record, FORMAT.record_array]) @@ -111,10 +118,16 @@ def create_template(self): self.template = Template(self.template_string) def render_template(self, packet): - if type(packet.data) is list: - packet.data = [self.template.substitute(item) for item in packet.data] + if self.safe_substitution: + if type(packet.data) is list: + packet.data = [self.template.safe_substitute(item) for item in packet.data] + else: + packet.data = self.template.safe_substitute(packet.data) else: - packet.data = self.template.substitute(packet.data) + if type(packet.data) is list: + packet.data = [self.template.substitute(item) for item in packet.data] + else: + packet.data = self.template.substitute(packet.data) return packet From 2c062288da6b3a725634f825d4ce08df1d3da02d Mon Sep 17 00:00:00 2001 From: Frank Steggink Date: Thu, 8 Feb 2018 18:08:44 +0100 Subject: [PATCH 3/8] Added new ExecFilter, which executes a command and returns it output as packet data --- stetl/filters/execfilter.py | 54 +++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 stetl/filters/execfilter.py diff --git a/stetl/filters/execfilter.py b/stetl/filters/execfilter.py new file mode 100644 index 0000000..ce07d65 --- /dev/null +++ b/stetl/filters/execfilter.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Executes the given command and returns the captured output. +# +# Author: Frank Steggink +# +import subprocess +import os +import shutil +from stetl.filter import Filter +from stetl.util import Util +from stetl.packet import FORMAT + +log = Util.get_log('execfilter') + + +class ExecFilter(Filter): + """ + Executes any command (abstract base class). 
+ """ + + def __init__(self, configdict, section, consumes, produces): + Filter.__init__(self, configdict, section, consumes, produces) + + def invoke(self, packet): + return packet + + def execute_cmd(self, cmd): + use_shell = True + if os.name == 'nt': + use_shell = False + + log.info("executing cmd=%s" % cmd) + result = subprocess.check_output(cmd, shell=use_shell) + log.info("execute done") + return result + + +class CommandExecFilter(ExecFilter): + """ + Executes an arbitrary command and captures the output + + consumes=FORMAT.string, produces=FORMAT.string + """ + + def __init__(self, configdict, section): + ExecFilter.__init__(self, configdict, section, consumes=FORMAT.string, produces=FORMAT.string) + + def invoke(self, packet): + if packet.data is not None: + packet.data = self.execute_cmd(packet.data) + + return packet \ No newline at end of file From bf020b4cd3d2ab3fa037b045aa2029089fc2ab0d Mon Sep 17 00:00:00 2001 From: Frank Steggink Date: Thu, 8 Feb 2018 18:13:08 +0100 Subject: [PATCH 4/8] Added new RegexFilter, which parses data from a string using a regex with named groups. The extracted data is returned as a record. --- stetl/filters/regexfilter.py | 61 ++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 stetl/filters/regexfilter.py diff --git a/stetl/filters/regexfilter.py b/stetl/filters/regexfilter.py new file mode 100644 index 0000000..c2b37b5 --- /dev/null +++ b/stetl/filters/regexfilter.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Extracts data from a string using a regular expression and generates a record. +# +# Author: Frank Steggink + +from stetl.component import Config +from stetl.filter import Filter +from stetl.packet import FORMAT +from stetl.util import Util +import re + +log = Util.get_log("regexfilter") + + +class RegexFilter(Filter): + """ + Extracts data from a string using a regular expression and returns the named groups as a record. 
+ consumes=FORMAT.string, produces=FORMAT.record + """ + + # Start attribute config meta + # Applying Decorator pattern with the Config class to provide + # read-only config values from the configured properties. + + @Config(ptype=str, default=None, required=True) + def pattern_string(self): + """ + Regex pattern string. Should contain named groups. + """ + pass + + # End attribute config meta + + # Constructor + def __init__(self, configdict, section, consumes=FORMAT.string, produces=FORMAT.record): + Filter.__init__(self, configdict, section, consumes, produces) + + def init(self): + log.info('Init: regex filter') + if self.pattern_string is None: + # If no pattern_string is present: + err_s = 'The pattern_string needs to be configured' + log.error(err_s) + raise ValueError('The pattern_string needs to be configured') + + def exit(self): + log.info('Exit: regex filter') + + def invoke(self, packet): + if packet.data is None: + return packet + + m = re.match(self.pattern_string, packet.data, re.S) + if m is not None: + packet.data = m.groupdict() + else: + packet.data = {} + + return packet From a5bfef6b9d98123cb822544395c621a2158cbc17 Mon Sep 17 00:00:00 2001 From: Frank Steggink Date: Tue, 27 Feb 2018 09:54:44 +0100 Subject: [PATCH 5/8] Solved some flake8 issues --- stetl/filters/execfilter.py | 3 +-- stetl/filters/zipfileextractor.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/stetl/filters/execfilter.py b/stetl/filters/execfilter.py index ce07d65..6014347 100644 --- a/stetl/filters/execfilter.py +++ b/stetl/filters/execfilter.py @@ -7,7 +7,6 @@ # import subprocess import os -import shutil from stetl.filter import Filter from stetl.util import Util from stetl.packet import FORMAT @@ -51,4 +50,4 @@ def invoke(self, packet): if packet.data is not None: packet.data = self.execute_cmd(packet.data) - return packet \ No newline at end of file + return packet diff --git a/stetl/filters/zipfileextractor.py b/stetl/filters/zipfileextractor.py 
index a614e71..16fd8fd 100644 --- a/stetl/filters/zipfileextractor.py +++ b/stetl/filters/zipfileextractor.py @@ -28,7 +28,7 @@ def file_path(self): File name to write the extracted file to. """ pass - + @Config(ptype=bool, default=True, required=False) def delete_file(self): """ From 4d75bba26ce62debf83c9ced6841e2a7dd4b6f18 Mon Sep 17 00:00:00 2001 From: Frank Steggink Date: Tue, 27 Feb 2018 13:37:15 +0100 Subject: [PATCH 6/8] Small improvements, per Just's recommendations --- stetl/filters/regexfilter.py | 4 +++- stetl/filters/templatingfilter.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/stetl/filters/regexfilter.py b/stetl/filters/regexfilter.py index c2b37b5..c66e2db 100644 --- a/stetl/filters/regexfilter.py +++ b/stetl/filters/regexfilter.py @@ -37,6 +37,8 @@ def pattern_string(self): def __init__(self, configdict, section, consumes=FORMAT.string, produces=FORMAT.record): Filter.__init__(self, configdict, section, consumes, produces) + self.regex_object = re.compile(self.pattern_string, re.S) + def init(self): log.info('Init: regex filter') if self.pattern_string is None: @@ -52,7 +54,7 @@ def invoke(self, packet): if packet.data is None: return packet - m = re.match(self.pattern_string, packet.data, re.S) + m = self.regex_object.match(packet.data) if m is not None: packet.data = m.groupdict() else: diff --git a/stetl/filters/templatingfilter.py b/stetl/filters/templatingfilter.py index e506a2c..0790881 100644 --- a/stetl/filters/templatingfilter.py +++ b/stetl/filters/templatingfilter.py @@ -97,7 +97,9 @@ class StringTemplatingFilter(TemplatingFilter): @Config(ptype=bool, default=False, required=False) def safe_substitution(self): """ - Apply safe substitution? + Apply safe substitution? With this method, string.Template.safe_substitute will be invoked, instead of + string.Template.substitute. 
If placeholders are missing from mapping and keywords, instead of raising an + exception, the original placeholder will appear in the resulting string intact. """ pass From 3db96556f079200a3c84a0584fd4b34a3aa65302 Mon Sep 17 00:00:00 2001 From: Frank Steggink Date: Tue, 27 Feb 2018 14:04:23 +0100 Subject: [PATCH 7/8] Added unit test for RegexFilter --- tests/data/ogrinfo_output.txt | 48 +++++++++++++++++++++++++++ tests/filters/configs/regexfilter.cfg | 18 ++++++++++ tests/filters/test_regex_filter.py | 38 +++++++++++++++++++++ 3 files changed, 104 insertions(+) create mode 100644 tests/data/ogrinfo_output.txt create mode 100644 tests/filters/configs/regexfilter.cfg create mode 100644 tests/filters/test_regex_filter.py diff --git a/tests/data/ogrinfo_output.txt b/tests/data/ogrinfo_output.txt new file mode 100644 index 0000000..71317d5 --- /dev/null +++ b/tests/data/ogrinfo_output.txt @@ -0,0 +1,48 @@ +INFO: Open of `bgt_gebouwinstallatie.gml' + using driver `GML' successful. + +Layer name: BuildingInstallation +Geometry: Curve Polygon +Feature Count: 1162 +Extent: (93971.990000, 433941.050000) - (96020.190000, 436004.350000) +Layer SRS WKT: +PROJCS["Amersfoort / RD New", + GEOGCS["Amersfoort", + DATUM["Amersfoort", + SPHEROID["Bessel 1841",6377397.155,299.1528128, + AUTHORITY["EPSG","7004"]], + TOWGS84[565.2369,50.0087,465.658,-0.406857,0.350733,-1.87035,4.0812], + AUTHORITY["EPSG","6289"]], + PRIMEM["Greenwich",0, + AUTHORITY["EPSG","8901"]], + UNIT["degree",0.0174532925199433, + AUTHORITY["EPSG","9122"]], + AXIS["Latitude",NORTH], + AXIS["Longitude",EAST], + AUTHORITY["EPSG","4289"]], + PROJECTION["Oblique_Stereographic"], + PARAMETER["latitude_of_origin",52.15616055555555], + PARAMETER["central_meridian",5.38763888888889], + PARAMETER["scale_factor",0.9999079], + PARAMETER["false_easting",155000], + PARAMETER["false_northing",463000], + UNIT["metre",1, + AUTHORITY["EPSG","9001"]], + AXIS["X",EAST], + AXIS["Y",NORTH], + AUTHORITY["EPSG","28992"]] +gml_id: String 
(0.0) NOT NULL +creationDate: String (10.0) +LV-publicatiedatum: String (23.0) +relatieveHoogteligging: Integer (0.0) +inOnderzoek: Integer(Boolean) (0.0) +tijdstipRegistratie: String (23.0) +namespace: String (8.0) +lokaalID: String (38.0) +bronhouder: String (5.0) +bgt-status: String (8.0) +plus-status: String (10.0) +function: String (8.0) +plus-typeGebouwInstallatie: String (12.0) +terminationDate: String (10.0) +eindRegistratie: String (23.0) diff --git a/tests/filters/configs/regexfilter.cfg b/tests/filters/configs/regexfilter.cfg new file mode 100644 index 0000000..33de23c --- /dev/null +++ b/tests/filters/configs/regexfilter.cfg @@ -0,0 +1,18 @@ +# Config file for unit testing RegexFilter. + +[etl] +chains = input_string_file|regex_filter|packet_buffer|output_std + +[input_string_file] +class = inputs.fileinput.StringFileInput +file_path = tests/data/ogrinfo_output.txt + +[regex_filter] +class = filters.regexfilter.RegexFilter +pattern_string = .*Layer name: (\w+:)?(?P<elemtype>\w+).*Feature Count: (?P<featurecount>[0-9]+).* + +[packet_buffer] +class = filters.packetbuffer.PacketBuffer + +[output_std] +class = outputs.standardoutput.StandardOutput diff --git a/tests/filters/test_regex_filter.py b/tests/filters/test_regex_filter.py new file mode 100644 index 0000000..15e8937 --- /dev/null +++ b/tests/filters/test_regex_filter.py @@ -0,0 +1,38 @@ +import os + +from stetl.etl import ETL +from stetl.filters.packetbuffer import PacketBuffer +from stetl.filters.regexfilter import RegexFilter +from tests.stetl_test_case import StetlTestCase + +class RegexFilterTest(StetlTestCase): + """Unit tests for RegexFilter""" + + def setUp(self): + super(RegexFilterTest, self).setUp() + + # Initialize Stetl + curr_dir = os.path.dirname(os.path.realpath(__file__)) + cfg_dict = {'config_file': os.path.join(curr_dir, 'configs/regexfilter.cfg')} + self.etl = ETL(cfg_dict) + + def test_class(self): + chain = StetlTestCase.get_chain(self.etl) + section = StetlTestCase.get_section(chain, 1) + class_name = 
self.etl.configdict.get(section, 'class') + + self.assertEqual('filters.regexfilter.RegexFilter', class_name) + + def test_instance(self): + chain = StetlTestCase.get_chain(self.etl) + + self.assertTrue(isinstance(chain.get_by_index(1), RegexFilter)) + + def test_execute(self): + chain = StetlTestCase.get_chain(self.etl) + chain.run() + + buffer_filter = chain.get_by_class(PacketBuffer) + packet_list = buffer_filter.packet_list + + self.assertEqual(str(packet_list[0].data), "{'elemtype': 'BuildingInstallation', 'featurecount': '1162'}") From 07670a6f99e20faf17474b8b94541bb5db0a8daa Mon Sep 17 00:00:00 2001 From: Frank Steggink Date: Tue, 27 Feb 2018 14:50:20 +0100 Subject: [PATCH 8/8] Added unit test for CommandExecFilter --- tests/data/commandexecfilter.txt | 1 + tests/filters/configs/commandexecfilter.cfg | 17 +++++++++ tests/filters/test_command_exec_filter.py | 38 +++++++++++++++++++++ 3 files changed, 56 insertions(+) create mode 100644 tests/data/commandexecfilter.txt create mode 100644 tests/filters/configs/commandexecfilter.cfg create mode 100644 tests/filters/test_command_exec_filter.py diff --git a/tests/data/commandexecfilter.txt b/tests/data/commandexecfilter.txt new file mode 100644 index 0000000..4a7a5d5 --- /dev/null +++ b/tests/data/commandexecfilter.txt @@ -0,0 +1 @@ +python -c "print '{0}/{1}'.format('foo','bar')" \ No newline at end of file diff --git a/tests/filters/configs/commandexecfilter.cfg b/tests/filters/configs/commandexecfilter.cfg new file mode 100644 index 0000000..f047801 --- /dev/null +++ b/tests/filters/configs/commandexecfilter.cfg @@ -0,0 +1,17 @@ +# Config file for unit testing CommandExecFilter. 
+ +[etl] +chains = input_string_file|command_executor|packet_buffer|output_std + +[input_string_file] +class = inputs.fileinput.StringFileInput +file_path = tests/data/commandexecfilter.txt + +[command_executor] +class = filters.execfilter.CommandExecFilter + +[packet_buffer] +class = filters.packetbuffer.PacketBuffer + +[output_std] +class = outputs.standardoutput.StandardOutput diff --git a/tests/filters/test_command_exec_filter.py b/tests/filters/test_command_exec_filter.py new file mode 100644 index 0000000..1112bd7 --- /dev/null +++ b/tests/filters/test_command_exec_filter.py @@ -0,0 +1,38 @@ +import os + +from stetl.etl import ETL +from stetl.filters.packetbuffer import PacketBuffer +from stetl.filters.execfilter import CommandExecFilter +from tests.stetl_test_case import StetlTestCase + +class CommandExecFilterTest(StetlTestCase): + """Unit tests for CommandExecFilter""" + + def setUp(self): + super(CommandExecFilterTest, self).setUp() + + # Initialize Stetl + curr_dir = os.path.dirname(os.path.realpath(__file__)) + cfg_dict = {'config_file': os.path.join(curr_dir, 'configs/commandexecfilter.cfg')} + self.etl = ETL(cfg_dict) + + def test_class(self): + chain = StetlTestCase.get_chain(self.etl) + section = StetlTestCase.get_section(chain, 1) + class_name = self.etl.configdict.get(section, 'class') + + self.assertEqual('filters.execfilter.CommandExecFilter', class_name) + + def test_instance(self): + chain = StetlTestCase.get_chain(self.etl) + + self.assertTrue(isinstance(chain.get_by_index(1), CommandExecFilter)) + + def test_execute(self): + chain = StetlTestCase.get_chain(self.etl) + chain.run() + + buffer_filter = chain.get_by_class(PacketBuffer) + packet_list = buffer_filter.packet_list + + self.assertEqual(packet_list[0].data.strip(), "foo/bar")