Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Stetl bgt improvements #69

Merged
merged 8 commits into from
Feb 27, 2018
53 changes: 53 additions & 0 deletions stetl/filters/execfilter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Executes the given command and returns the captured output.
#
# Author: Frank Steggink
#
import subprocess
import os
from stetl.filter import Filter
from stetl.util import Util
from stetl.packet import FORMAT

log = Util.get_log('execfilter')


class ExecFilter(Filter):
    """
    Abstract base class for filters that run an external command.

    The invoke() defined here hands the packet back untouched; execute_cmd() is the
    helper that actually runs a command and returns what it printed.
    """

    def __init__(self, configdict, section, consumes, produces):
        Filter.__init__(self, configdict, section, consumes, produces)

    def invoke(self, packet):
        # Nothing to execute at this level: pass the packet through as-is.
        return packet

    def execute_cmd(self, cmd):
        """
        Run cmd with subprocess.check_output and return the captured output.

        The command is run through the shell on every platform except Windows
        (os.name == 'nt'). subprocess.check_output raises CalledProcessError
        when the command exits with a non-zero status.
        """
        run_in_shell = os.name != 'nt'

        log.info("executing cmd=%s" % cmd)
        captured = subprocess.check_output(cmd, shell=run_in_shell)
        log.info("execute done")
        return captured


class CommandExecFilter(ExecFilter):
    """
    Runs the command carried in the incoming packet and replaces the packet data
    with the output captured from that command.

    consumes=FORMAT.string, produces=FORMAT.string
    """

    def __init__(self, configdict, section):
        ExecFilter.__init__(self, configdict, section, consumes=FORMAT.string, produces=FORMAT.string)

    def invoke(self, packet):
        # A packet without data carries no command: pass it on unchanged.
        if packet.data is None:
            return packet

        command = packet.data
        packet.data = self.execute_cmd(command)
        return packet
63 changes: 63 additions & 0 deletions stetl/filters/regexfilter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#!/usr/bin/env python
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Useful, but an example would help; it is hard to grasp otherwise. Suggestions:

  • Can't the regexes be compiled once during init?
  • Are more uses expected? Maybe a base class RegexFilter with subclasses such as RegexToRecordFilter?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Compilation: good point.
More uses: I haven't thought about it yet. It is possible, but at the moment I don't have any other concrete use cases. Looking at the possible formats, I think only struct would be a good option. Although formats like geojson_feature, ogr_feature and etree_element could represent the parsed data, they are too specialized. The output of regexfilter, a dictionary, is not something you would typically write directly.

# -*- coding: utf-8 -*-
#
# Extracts data from a string using a regular expression and generates a record.
#
# Author: Frank Steggink

from stetl.component import Config
from stetl.filter import Filter
from stetl.packet import FORMAT
from stetl.util import Util
import re

log = Util.get_log("regexfilter")


class RegexFilter(Filter):
    """
    Extracts data from a string using a regular expression and returns the named groups as a record.

    The pattern is compiled with re.S (so '.' also matches newlines) and applied with
    match(), i.e. anchored at the start of the packet data. When the pattern does not
    match, the packet data becomes an empty record.

    Example: with

        pattern_string = .*Feature Count: (?P<featurecount>[0-9]+).*

    a packet whose data contains the text "Feature Count: 1162" leaves this filter with
    data {'featurecount': '1162'}.

    consumes=FORMAT.string, produces=FORMAT.record
    """

    # Start attribute config meta
    # Applying Decorator pattern with the Config class to provide
    # read-only config values from the configured properties.

    @Config(ptype=str, default=None, required=True)
    def pattern_string(self):
        """
        Regex pattern string. Should contain named groups.
        """
        pass

    # End attribute config meta

    # Constructor
    def __init__(self, configdict, section, consumes=FORMAT.string, produces=FORMAT.record):
        Filter.__init__(self, configdict, section, consumes, produces)

        # Compile once here instead of on every invoke(). Compilation is skipped when no
        # pattern is available, so that init() can report the missing setting with a clear
        # ValueError instead of re.compile() failing on None with a TypeError.
        self.regex_object = None
        if self.pattern_string is not None:
            self.regex_object = re.compile(self.pattern_string, re.S)

    def init(self):
        """
        Validate the configuration.

        Raises ValueError when no pattern_string has been configured.
        """
        log.info('Init: regex filter')
        if self.pattern_string is None:
            err_s = 'The pattern_string needs to be configured'
            log.error(err_s)
            raise ValueError(err_s)

    def exit(self):
        log.info('Exit: regex filter')

    def invoke(self, packet):
        """
        Replace the packet's string data with a dict of the pattern's named groups.

        Packets without data are returned untouched; data that does not match the
        pattern is replaced by an empty dict.
        """
        if packet.data is None:
            return packet

        m = self.regex_object.match(packet.data)
        packet.data = m.groupdict() if m is not None else {}

        return packet
21 changes: 18 additions & 3 deletions stetl/filters/templatingfilter.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,15 @@ class StringTemplatingFilter(TemplatingFilter):
consumes=FORMAT.record or FORMAT.record_array, produces=FORMAT.string
"""

@Config(ptype=bool, default=False, required=False)
def safe_substitution(self):
    """
    Apply safe substitution? When enabled, string.Template.safe_substitute is invoked instead of
    string.Template.substitute: placeholders that are missing from mapping and keywords are left
    intact in the resulting string instead of raising an exception. Optional, defaults to False.
    """
    pass

def __init__(self, configdict, section):
TemplatingFilter.__init__(self, configdict, section, consumes=[FORMAT.record, FORMAT.record_array])

Expand All @@ -111,10 +120,16 @@ def create_template(self):
self.template = Template(self.template_string)

def render_template(self, packet):
if type(packet.data) is list:
packet.data = [self.template.substitute(item) for item in packet.data]
if self.safe_substitution:
if type(packet.data) is list:
packet.data = [self.template.safe_substitute(item) for item in packet.data]
else:
packet.data = self.template.safe_substitute(packet.data)
else:
packet.data = self.template.substitute(packet.data)
if type(packet.data) is list:
packet.data = [self.template.substitute(item) for item in packet.data]
else:
packet.data = self.template.substitute(packet.data)

return packet

Expand Down
9 changes: 8 additions & 1 deletion stetl/filters/zipfileextractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,13 @@ def file_path(self):
"""
pass

@Config(ptype=bool, default=True, required=False)
def delete_file(self):
    """
    Delete the file when the chain has been completed? Optional, defaults to True;
    set to False to keep the file on disk.
    """
    pass

# End attribute config meta

# Constructor
Expand Down Expand Up @@ -58,7 +65,7 @@ def invoke(self, packet):

def after_chain_invoke(self, packet):
import os.path
if os.path.isfile(self.cur_file_path):
if os.path.isfile(self.cur_file_path) and self.delete_file:
os.remove(self.cur_file_path)

return True
1 change: 1 addition & 0 deletions tests/data/commandexecfilter.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
python -c "print('{0}/{1}'.format('foo','bar'))"
48 changes: 48 additions & 0 deletions tests/data/ogrinfo_output.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
INFO: Open of `bgt_gebouwinstallatie.gml'
using driver `GML' successful.

Layer name: BuildingInstallation
Geometry: Curve Polygon
Feature Count: 1162
Extent: (93971.990000, 433941.050000) - (96020.190000, 436004.350000)
Layer SRS WKT:
PROJCS["Amersfoort / RD New",
GEOGCS["Amersfoort",
DATUM["Amersfoort",
SPHEROID["Bessel 1841",6377397.155,299.1528128,
AUTHORITY["EPSG","7004"]],
TOWGS84[565.2369,50.0087,465.658,-0.406857,0.350733,-1.87035,4.0812],
AUTHORITY["EPSG","6289"]],
PRIMEM["Greenwich",0,
AUTHORITY["EPSG","8901"]],
UNIT["degree",0.0174532925199433,
AUTHORITY["EPSG","9122"]],
AXIS["Latitude",NORTH],
AXIS["Longitude",EAST],
AUTHORITY["EPSG","4289"]],
PROJECTION["Oblique_Stereographic"],
PARAMETER["latitude_of_origin",52.15616055555555],
PARAMETER["central_meridian",5.38763888888889],
PARAMETER["scale_factor",0.9999079],
PARAMETER["false_easting",155000],
PARAMETER["false_northing",463000],
UNIT["metre",1,
AUTHORITY["EPSG","9001"]],
AXIS["X",EAST],
AXIS["Y",NORTH],
AUTHORITY["EPSG","28992"]]
gml_id: String (0.0) NOT NULL
creationDate: String (10.0)
LV-publicatiedatum: String (23.0)
relatieveHoogteligging: Integer (0.0)
inOnderzoek: Integer(Boolean) (0.0)
tijdstipRegistratie: String (23.0)
namespace: String (8.0)
lokaalID: String (38.0)
bronhouder: String (5.0)
bgt-status: String (8.0)
plus-status: String (10.0)
function: String (8.0)
plus-typeGebouwInstallatie: String (12.0)
terminationDate: String (10.0)
eindRegistratie: String (23.0)
17 changes: 17 additions & 0 deletions tests/filters/configs/commandexecfilter.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Config file for unit testing CommandExecFilter.

[etl]
chains = input_string_file|command_executor|packet_buffer|output_std

[input_string_file]
class = inputs.fileinput.StringFileInput
file_path = tests/data/commandexecfilter.txt

[command_executor]
class = filters.execfilter.CommandExecFilter

[packet_buffer]
class = filters.packetbuffer.PacketBuffer

[output_std]
class = outputs.standardoutput.StandardOutput
18 changes: 18 additions & 0 deletions tests/filters/configs/regexfilter.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Config file for unit testing RegexFilter.

[etl]
chains = input_string_file|regex_filter|packet_buffer|output_std

[input_string_file]
class = inputs.fileinput.StringFileInput
file_path = tests/data/ogrinfo_output.txt

[regex_filter]
class = filters.regexfilter.RegexFilter
pattern_string = .*Layer name: (\w+:)?(?P<elemtype>\w+).*Feature Count: (?P<featurecount>[0-9]+).*

[packet_buffer]
class = filters.packetbuffer.PacketBuffer

[output_std]
class = outputs.standardoutput.StandardOutput
38 changes: 38 additions & 0 deletions tests/filters/test_command_exec_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import os

from stetl.etl import ETL
from stetl.filters.packetbuffer import PacketBuffer
from stetl.filters.execfilter import CommandExecFilter
from tests.stetl_test_case import StetlTestCase

class CommandExecFilterTest(StetlTestCase):
    """Unit tests for CommandExecFilter"""

    def setUp(self):
        super(CommandExecFilterTest, self).setUp()

        # Initialize Stetl with the config file stored next to this test module.
        curr_dir = os.path.dirname(os.path.realpath(__file__))
        cfg_dict = {'config_file': os.path.join(curr_dir, 'configs/commandexecfilter.cfg')}
        self.etl = ETL(cfg_dict)

    def test_class(self):
        # Index 1 is the second component of the chain, right after the input.
        chain = StetlTestCase.get_chain(self.etl)
        section = StetlTestCase.get_section(chain, 1)
        class_name = self.etl.configdict.get(section, 'class')

        self.assertEqual('filters.execfilter.CommandExecFilter', class_name)

    def test_instance(self):
        chain = StetlTestCase.get_chain(self.etl)

        # assertIsInstance reports the actual type on failure, unlike assertTrue(isinstance(...)).
        self.assertIsInstance(chain.get_by_index(1), CommandExecFilter)

    def test_execute(self):
        chain = StetlTestCase.get_chain(self.etl)
        chain.run()

        # The PacketBuffer in the chain keeps the packets that passed through it.
        buffer_filter = chain.get_by_class(PacketBuffer)
        packet_list = buffer_filter.packet_list

        # strip() removes the trailing newline emitted by the executed command.
        self.assertEqual(packet_list[0].data.strip(), "foo/bar")
38 changes: 38 additions & 0 deletions tests/filters/test_regex_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import os

from stetl.etl import ETL
from stetl.filters.packetbuffer import PacketBuffer
from stetl.filters.regexfilter import RegexFilter
from tests.stetl_test_case import StetlTestCase

class RegexFilterTest(StetlTestCase):
    """Unit tests for RegexFilter"""

    def setUp(self):
        super(RegexFilterTest, self).setUp()

        # Initialize Stetl with the config file stored next to this test module.
        curr_dir = os.path.dirname(os.path.realpath(__file__))
        cfg_dict = {'config_file': os.path.join(curr_dir, 'configs/regexfilter.cfg')}
        self.etl = ETL(cfg_dict)

    def test_class(self):
        # Index 1 is the second component of the chain, right after the input.
        chain = StetlTestCase.get_chain(self.etl)
        section = StetlTestCase.get_section(chain, 1)
        class_name = self.etl.configdict.get(section, 'class')

        self.assertEqual('filters.regexfilter.RegexFilter', class_name)

    def test_instance(self):
        chain = StetlTestCase.get_chain(self.etl)

        # assertIsInstance reports the actual type on failure, unlike assertTrue(isinstance(...)).
        self.assertIsInstance(chain.get_by_index(1), RegexFilter)

    def test_execute(self):
        chain = StetlTestCase.get_chain(self.etl)
        chain.run()

        # The PacketBuffer in the chain keeps the packets that passed through it.
        buffer_filter = chain.get_by_class(PacketBuffer)
        packet_list = buffer_filter.packet_list

        # Compare the record itself rather than its str() form: the textual form depends on
        # dict key order and on the repr of the values, neither of which is under test here.
        expected = {'elemtype': 'BuildingInstallation', 'featurecount': '1162'}
        self.assertEqual(packet_list[0].data, expected)