Merge pull request #17 from sarnold/id-munging
Id munging and cleanup
sarnold authored Jul 6, 2024
2 parents 01f0009 + 5905052 commit b3c7976
Showing 22 changed files with 1,640 additions and 119 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ jobs:
fail-fast: false
matrix:
os: [ubuntu-20.04, macos-latest, windows-latest]
python-version: [3.8, 3.9, '3.10', '3.11']
python-version: ['3.9', '3.10', '3.11', '3.12']
steps:
- name: Set git crlf/eol
run: |
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ jobs:
fail-fast: false
matrix:
os: [ubuntu-20.04, macos-latest, windows-latest]
python-version: [3.8, 3.9, '3.10', '3.12']
python-version: ['3.9', '3.10', '3.11', '3.12']

steps:
- name: Set git crlf/eol
Expand Down
11 changes: 6 additions & 5 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# pre-commit install
# To update the pre-commit hooks run:
# pre-commit install-hooks
exclude: '^(.tox/|docs/|^setup.py$)'
exclude: '^(.tox/|docs/|^setup.py$|^tests/data/controls.csv$)'
repos:
- repo: meta
hooks:
Expand Down Expand Up @@ -50,6 +50,7 @@ repos:
- natsort
- munch
- munch-stubs
- PyYAML
- ruamel.yaml
- dpath
- nested_lookup
Expand All @@ -58,7 +59,7 @@ repos:
- --follow-imports=normal
- --install-types
- --non-interactive
files: src/ymltoxml/
files: src/yaml_tools/

- repo: "https://github.com/asottile/blacken-docs"
rev: "1.16.0"
Expand Down Expand Up @@ -89,7 +90,7 @@ repos:
rev: v2.3.1
hooks:
- id: autoflake
files: src/ymltoxml/
files: src/yaml_tools/
args:
- --in-place
- --remove-all-unused-imports
Expand All @@ -100,15 +101,15 @@ repos:
rev: 7.0.0
hooks:
- id: flake8
files: src/ymltoxml/
files: src/yaml_tools/
additional_dependencies: ["flake8-bugbear"]

- repo: https://github.com/PyCQA/bandit
rev: 1.7.8
hooks:
- id: bandit
args: ["-ll", "-q"]
files: src/ymltoxml/
files: src/yaml_tools/

# - repo: https://github.com/lovesegfault/beautysh
# rev: v6.2.1
Expand Down
36 changes: 32 additions & 4 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ input file types to ingest SSG and other upstream data, eg, NIST
oscal-content_.

.. _SCAP Security Guide: https://github.com/ComplianceAsCode/content
.. _oscal-content: https://github.com/usnistgov/oscal-content.git
.. _oscal-content: https://github.com/usnistgov/oscal-content

Quick Start
===========
Expand All @@ -31,7 +31,8 @@ Available modules, console commands, and scripts:
* ``oscal`` (*WIP*) - ingest NIST 800-53 content in multiple formats

Experimental "demo" scripts:
* ``analyze_control_ids.py`` - analyze control ID sets with fuzzy match

* ``analyze_control_ids.py`` - analyze control ID sets with optional fuzzy match
* ``analyze_ssg_controls.py`` - analyze NIST controls from SSG content

For the above "demo" scripts, check the top of the source file for any knobs
Expand All @@ -53,7 +54,7 @@ Install with pip
This package is *not* yet published on PyPI, thus use one of the following
to install yaml-tools on any platform. Install from the main branch::

$ https://github.com/sarnold/yaml-tools/archive/refs/heads/main.tar.gz
$ pip install https://github.com/sarnold/yaml-tools/archive/refs/heads/main.tar.gz

or use this command to install a specific release version::

Expand Down Expand Up @@ -236,8 +237,31 @@ Default yasort.yaml:
Features and limitations
------------------------

**NIST control ID munging**

The demo scripts and ``oscal`` module illustrate various forms of control ID
normalization in order to match control IDs from multiple input sources.
Currently, there are 2 primary ID formats, and which one to use is mainly
a user choice:

* AC-01(01)(a) - uppercase with parentheses
* ac-01.01.a - lowercase with dots

Nested controls follow the upstream_ pattern of alternating lettered and
numbered sub-bullets for each level, and the latest rev5 controls add
leading zeros.

The ``xform_id`` function is *idempotent* with the following caveats:

* extraneous whitespace is always dropped
* leading zeros are added to single digit values where needed

.. _upstream: https://github.com/usnistgov/oscal-content
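As a rough illustration of the two ID forms above, a minimal normalization helper might look like the following. This is a hypothetical sketch, not the project's actual ``xform_id``, but it shows the caveats listed: whitespace is always dropped, and single-digit segments gain a leading zero.

```python
import re


def normalize_id(raw: str) -> str:
    """Map 'AC-1(1)(a)' style IDs to lowercase dotted form, e.g.
    'ac-01.01.a'.  Idempotent: dotted input passes through unchanged."""
    s = raw.strip().lower()
    # parenthesized sub-parts become dotted segments: (1)(a) -> .1.a
    s = re.sub(r'\(([^)]+)\)', r'.\1', s)
    # zero-pad lone single digits: ac-1.1.a -> ac-01.01.a
    s = re.sub(r'(?<!\d)\d(?!\d)', lambda m: '0' + m.group(0), s)
    return s
```

Running the helper a second time over its own output is a no-op, which is the idempotence property described above.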

**XML <==> YAML** conversion

We mainly test ymltoxml on mavlink XML message definitions and NIST/SSG
YAML files, so round-trip conversion *may not* work at all on
content files, so round-trip conversion *may not* work at all on
arbitrarily complex XML files with namespaces, etc. The current
round-trip is not exact, due to the following:

Expand Down Expand Up @@ -328,6 +352,10 @@ to specify the Python version and host OS type, run something like::

$ tox -e py39-linux

To generate a coverage file, run something like the following::

$ tox -e py,coverage

Additional ``tox`` commands:

* ``tox -e changes`` (re)generate the changelog file
Expand Down
5 changes: 3 additions & 2 deletions scripts/csvchk.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,16 @@
Simple consumer test.
"""

from ymltoxml.utils import text_data_writer, text_file_reader
from yaml_tools.utils import text_data_writer, text_file_reader

OPTS = {
'file_encoding': 'utf-8',
'output_format': 'csv',
'default_csv_hdr': None,
}


# read in some json "column data"
data = text_file_reader('tests/data/catalog.json', OPTS)
data = text_file_reader('tests/data/controls.yml', OPTS)
# spit out CSV records
ret = text_data_writer(data, OPTS)
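In stdlib terms, the consumer test above amounts to "read structured column data, emit CSV records". A rough stand-in using only ``json`` and ``csv`` (the sample data is made up, and this is not the ``yaml_tools`` utils API):

```python
import csv
import io
import json

# hypothetical sample of the kind of "column data" the script reads
raw = json.loads(
    '[{"id": "ac-01", "status": "automated"},'
    ' {"id": "ac-02", "status": "manual"}]'
)

# spit out CSV records, header row first
buf = io.StringIO()
writer = csv.DictWriter(buf, fieldnames=list(raw[0]))
writer.writeheader()
writer.writerows(raw)
print(buf.getvalue(), end='')
```

The real ``text_file_reader``/``text_data_writer`` pair additionally honors the ``OPTS`` keys shown above (encoding, output format, default CSV header).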
6 changes: 1 addition & 5 deletions scripts/genxml.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
from pathlib import Path

import xmltodict
Expand All @@ -22,10 +21,7 @@ def post_process(xml_str):
data = yaml.load(yfile, Loader=yaml.Loader)

outfile = Path('out.xml')
xml = xmltodict.unparse(data,
short_empty_elements=False,
pretty=True,
indent=' ')
xml = xmltodict.unparse(data, short_empty_elements=False, pretty=True, indent=' ')

new_xml = post_process(xml)

Expand Down
2 changes: 1 addition & 1 deletion scripts/genyaml.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
from pathlib import Path

import xmltodict
Expand All @@ -11,6 +10,7 @@ class StrYAML(YAML):
New API likes dumping straight to file/stdout, so we subclass and
create 'inefficient' custom string dumper. <shrug>
"""

def dump(self, data, stream=None, **kw):
inefficient = False
if stream is None:
Expand Down
4 changes: 2 additions & 2 deletions scripts/xform_idchk.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@

from natsort import os_sorted

from ymltoxml.templates import xform_id
from ymltoxml.utils import text_file_reader
from yaml_tools.templates import xform_id
from yaml_tools.utils import text_file_reader

OPTS = {
'file_encoding': 'utf-8',
Expand Down
2 changes: 2 additions & 0 deletions src/yaml_tools/data/oscal.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@ default_ssg_path: 'ext/content/controls'
default_control_attr: 'status'
default_lookup_key: 'controls'
default_csv_hdr: null
new_csv_file: null
new_csv_hdrs: []
csv_delimiter: null
input_format: null
output_format: 'json'
output_path: null
Expand Down
105 changes: 70 additions & 35 deletions src/yaml_tools/oscal.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

from munch import Munch
from natsort import os_sorted

from nested_lookup import nested_lookup

from .templates import xform_id
Expand All @@ -32,31 +33,54 @@ def csv_append_id_data(in_ids, prog_opts, uargs): # pragma: no cover
the given filename with ``.modified`` appended to the filename stem.
"""
mpath = Path(uargs.munge)
opath = Path('.').joinpath(mpath.stem)
opath = (
Path(prog_opts['new_csv_file'])
if prog_opts['new_csv_file']
else Path('.').joinpath(mpath.stem)
)
new_opath = opath.with_suffix('.modified.csv')
delim = prog_opts['csv_delimiter'] if prog_opts['csv_delimiter'] else ';'
if uargs.verbose:
print(f'Writing munged csv data to {new_opath}')

writer = csv.writer(open(new_opath, 'w', newline='', encoding='utf-8'))
reader = csv.reader(open(uargs.munge, 'r', newline='', encoding='utf-8'))
writer = csv.writer(
open(new_opath, 'w', newline='', encoding='utf-8'),
delimiter=delim,
)
reader = csv.reader(
open(uargs.munge, 'r', newline='', encoding='utf-8'),
delimiter=delim,
)
headers = next(reader)
for hdr in prog_opts['new_csv_hdrs']:
headers.append(hdr)
writer.writerow(headers)

for ctl in reader:
ctl_id = xform_id(ctl[0])
sub_ids = [s for s in in_ids if ctl_id in s]
if ctl_id in in_ids:
ctl.append('Y')
elif sub_ids != []:
ctl.append(sub_ids[0])
else:
ctl.append('N')
ctl.append(ctl_id)
ctl = csv_row_match(in_ids, ctl)
writer.writerow(ctl)


def csv_row_match(in_ids, ctl):
"""
Extracted ctl munging from ``csv_append_id_data`` loop for testing.
:param ctl: csv row data
:type ctl: list
:return: munged ctl
"""
ctl_id = xform_id(ctl[0])
sub_ids = [s for s in in_ids if ctl_id in s]
if ctl_id in in_ids:
ctl.append('Y')
elif sub_ids != []:
ctl.append(sub_ids[0])
else:
ctl.append('N')
ctl.append(ctl_id)
return ctl
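The extracted helper can be exercised standalone, which is the point of pulling it out of the loop. Here ``xform_id`` is replaced by a simplified stand-in (the real one lives in ``yaml_tools.templates``), with the helper body copied from the hunk above:

```python
def xform_id(raw):
    # simplified stand-in for yaml_tools.templates.xform_id:
    # lowercase, and turn '(x)' groups into '.x' segments
    return raw.strip().lower().replace('(', '.').replace(')', '')


def csv_row_match(in_ids, ctl):
    """Append a match flag and the normalized ID to a csv row."""
    ctl_id = xform_id(ctl[0])
    sub_ids = [s for s in in_ids if ctl_id in s]
    if ctl_id in in_ids:
        ctl.append('Y')  # exact ID match
    elif sub_ids != []:
        ctl.append(sub_ids[0])  # first matching sub-control ID
    else:
        ctl.append('N')  # no match at all
    ctl.append(ctl_id)
    return ctl


in_ids = ['ac-01', 'ac-02.01']
print(csv_row_match(in_ids, ['AC-01', 'policy']))
# -> ['AC-01', 'policy', 'Y', 'ac-01']
print(csv_row_match(in_ids, ['AC-02', 'accounts']))
# -> ['AC-02', 'accounts', 'ac-02.01', 'ac-02']
```

A row whose normalized ID matches nothing (e.g. ``AC-03`` here) gets an ``N`` flag instead.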


def load_input_data(filepath, prog_opts, use_ssg=False, debug=False):
"""
Find and gather the inputs, ie, content file(s) and user control IDs,
Expand All @@ -76,7 +100,8 @@ def load_input_data(filepath, prog_opts, use_ssg=False, debug=False):
if use_ssg:
prog_opts['default_content_path'] = prog_opts['default_ssg_path']
prog_opts['default_profile_glob'] = prog_opts['default_ssg_glob']
else:

if debug:
print(f"Loading content from: {prog_opts['default_content_path']}")

ctl_files = get_filelist(
Expand Down Expand Up @@ -120,30 +145,33 @@ def load_input_data(filepath, prog_opts, use_ssg=False, debug=False):
return in_ids, id_queue, ctl_queue


def munge_file(filepath, prog_opts, uargs):
"""
Munge a CSV file by appending columns.
"""
input_ids = text_file_reader(filepath, prog_opts)
csv_append_id_data(input_ids, prog_opts=prog_opts, uargs=uargs)


def process_data(filepath, prog_opts, uargs):
"""
Process inputs, print some output.
"""
if uargs.munge:
input_ids = text_file_reader(filepath, prog_opts)
csv_append_id_data(input_ids, prog_opts=prog_opts, uargs=uargs)
else:
input_ids, id_queue, ctl_queue = load_input_data(
filepath, prog_opts, use_ssg=uargs.ssg, debug=uargs.verbose
)
in_list, not_in_list = id_set_match(input_ids, id_queue, uargs=uargs)
input_ids, id_queue, ctl_queue = load_input_data(
filepath, prog_opts, use_ssg=uargs.ssg, debug=uargs.verbose
)
in_list, _ = id_set_match(input_ids, id_queue, uargs=uargs)

if not uargs.quiet:
print(f'\nControl queue has {len(ctl_queue)} items')
rpt_attr = (
prog_opts['default_control_attr']
if prog_opts['default_control_attr']
else uargs.attribute
)
if uargs.verbose:
print(f'Checking input IDs: {in_list}')
print(f'\nID,{rpt_attr}')
for ctl in ctl_queue:
if ctl[0] in in_list:
print(f'{ctl[0]},{ctl[1][rpt_attr]}')

rpt_attr = uargs.attribute if uargs.attribute else prog_opts['default_control_attr']
if uargs.verbose:
print(f'Checking input IDs: {in_list}')
print(f'\nID;{rpt_attr}')
for ctl in ctl_queue:
if ctl[0] in in_list:
print(f'{ctl[0]};{ctl[1][rpt_attr]}')


def ssg_ctrl_from_nist(in_id, prog_opts, uargs):
Expand Down Expand Up @@ -264,6 +292,12 @@ def main(argv=None): # pragma: no cover
action='store_true',
help='display more processing info',
)
parser.add_argument(
'-q',
'--quiet',
action='store_true',
help='display less processing info',
)
parser.add_argument(
'-m',
'--munge-file',
Expand All @@ -274,7 +308,7 @@ def main(argv=None): # pragma: no cover
default=None,
)
parser.add_argument(
'-R',
'-r',
'--report-attribute',
metavar="ATTR",
type=str,
Expand Down Expand Up @@ -324,12 +358,13 @@ def main(argv=None): # pragma: no cover
if not Path(infile).exists():
print(f'Input file {infile} not found!')
sys.exit(1)
if args.munge:
munge_file(infile, popts, args)

if args.verbose:
print(f"Path to content: {cfg.default_content_path}")
print(f"Content file glob: {cfg.default_profile_glob}")
print(f"Input file: {infile}")
else:
if not args.quiet:
print(f"Processing input file: {infile}")

process_data(infile, popts, args)
Expand Down
