From 13d472383982875511001fb0325363c627291696 Mon Sep 17 00:00:00 2001
From: ClementAlba <93247842+ClementAlba@users.noreply.github.com>
Date: Fri, 3 Feb 2023 11:53:53 +0100
Subject: [PATCH] V 2.0.3 (#61)

* Create .readthedocs.yaml

* Removing warning message related to tile_size when input_type = dir
use compute instead of persist

* adding the remove tiles option

* adding a warning if the number of tasks is greater than the number of workers

* change warning message for number of workers

* add compression for the merge option

* add minor_version and dataformat_id for the merge step

* changing imports

* upgrade to 2.0.3
---
 .readthedocs.yaml                             |  29 +++++
 docs/source/conf.py                           |   2 +-
 setup.cfg                                     |   2 +-
 src/pdal_parallelizer/__init__.py             |  92 +++++++++-------
 src/pdal_parallelizer/cloud.py                |  48 ++++++---
 .../pdal_parallelizer_cli/__main__.py         |   6 +-
 src/pdal_parallelizer/tile.py                 | 102 ++++++++++--------
 7 files changed, 183 insertions(+), 98 deletions(-)
 create mode 100644 .readthedocs.yaml

diff --git a/.readthedocs.yaml b/.readthedocs.yaml
new file mode 100644
index 0000000..1ee7b94
--- /dev/null
+++ b/.readthedocs.yaml
@@ -0,0 +1,29 @@
+# .readthedocs.yaml
+# Read the Docs configuration file
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+
+# Required
+version: 2
+
+# Set the version of Python and other tools you might need
+build:
+  os: ubuntu-22.04
+  tools:
+    python: "3.11"
+    # You can also specify other tool versions:
+    # nodejs: "19"
+    # rust: "1.64"
+    # golang: "1.19"
+
+# Build documentation in the docs/ directory with Sphinx
+sphinx:
+   configuration: docs/conf.py
+
+# If using Sphinx, optionally build your docs in additional formats such as PDF
+# formats:
+#    - pdf
+
+# Optionally declare the Python requirements required to build your docs
+python:
+   install:
+   - requirements: requirements.txt
diff --git a/docs/source/conf.py b/docs/source/conf.py
index bf71489..fbf7b72 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -7,7 +7,7 @@
 author = 'Clément Alba'
 
 release = '2.0'
-version = '2.0.1'
+version = '2.0.2'
 
 # -- General configuration
 
diff --git a/setup.cfg b/setup.cfg
index 0c5123f..a0dc2f7 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = pdal-parallelizer
-version = 2.0.2
+version = 2.0.3
 description = A simple tool for use pdal with parallel execution
 long_description = file: README.rst, CHANGELOG.rst, LICENSE.rst
 keywords = pdal, parallelizer, dask
diff --git a/src/pdal_parallelizer/__init__.py b/src/pdal_parallelizer/__init__.py
index 3732bf0..5830236 100644
--- a/src/pdal_parallelizer/__init__.py
+++ b/src/pdal_parallelizer/__init__.py
@@ -1,4 +1,5 @@
 import json
+import logging
 import os
 import os.path
 import click
@@ -10,9 +11,12 @@
 from . import do
 from . import file_manager
 from . import cloud
+from . import tile
 from matplotlib import pyplot as plt
 import sys
 import ntpath
+from os.path import join
+from math import ceil
 
 
 def query_yes_no(question, default='no'):
@@ -46,7 +50,7 @@ def config_dask(n_workers, threads_per_worker, timeout):
     cfg.set({'interface': 'lo'})
     cfg.set({'distributed.scheduler.worker-ttl': None})
     cfg.set({'distributed.comm.timeouts.connect': timeout})
-    cluster = LocalCluster(n_workers=n_workers, threads_per_worker=threads_per_worker)
+    cluster = LocalCluster(n_workers=n_workers, threads_per_worker=threads_per_worker, silence_logs=logging.ERROR)
     client = Client(cluster)
     return client
 
@@ -63,8 +67,8 @@ def process_pipelines(
         buffer=None,
         remove_buffer=None,
         bounding_box=None,
-        merge_tiles=None,
-        process=False
+        merge_tiles=False,
+        remove_tiles=False
 ):
     # Assertions
     assert type(config) is str
@@ -76,7 +80,7 @@ def process_pipelines(
     if dry_run:
         assert type(dry_run) is int
     assert type(diagnostic) is bool
-    assert type(tile_size) is tuple
+    assert type(tile_size) is tuple and len(tile_size) == 2
     if buffer:
         assert type(buffer) is int
     if remove_buffer:
@@ -84,6 +88,15 @@ def process_pipelines(
     if bounding_box:
         assert type(bounding_box) is tuple
         assert len(bounding_box) == 4
+    assert type(merge_tiles) is bool
+    assert type(remove_tiles) is bool
+
+    with open(config, 'r') as c:
+        config_file = json.load(c)
+        input_dir = config_file.get('input')
+        output = config_file.get('output')
+        temp = config_file.get('temp')
+        pipeline = config_file.get('pipeline')
 
     if n_workers >= os.cpu_count():
         answer = query_yes_no(
@@ -93,20 +106,33 @@ def process_pipelines(
         if not answer:
             return
 
-    if tile_size == (256, 256):
-        answer = query_yes_no(
-            f'WARNING - You are using the default value of the tile_size option (256 by 256 meters). Please '
-            f'check if your points cloud\'s dimensions are greater than this value.\nDo you want to continue ? '
+    if input_type == 'single':
+        if tile_size == (256, 256):
+            answer = query_yes_no(
+                f'WARNING - You are using the default value of the tile_size option (256 by 256 meters). Please '
+                f'check if your points cloud\'s dimensions are greater than this value.\nDo you want to continue ? '
+            )
+            if not answer:
+                return
+
+        infos = cloud.compute_quickinfo(input_dir)
+        bounds = infos['summary']['bounds']
+        distX, distY = (
+            int(bounds['maxx'] - bounds['minx']),
+            int(bounds['maxy'] - bounds['miny'])
         )
-        if not answer:
-            return
 
-    with open(config, 'r') as c:
-        config_file = json.load(c)
-        input_dir = config_file.get('input')
-        output = config_file.get('output')
-        temp = config_file.get('temp')
-        pipeline = config_file.get('pipeline')
+        nTilesX = ceil(distX / tile_size[0])
+        nTilesY = ceil(distY / tile_size[0])
+
+        if nTilesX * nTilesY > n_workers:
+            answer = query_yes_no(
+                f'WARNING - With this size of tiles and this number of workers, each worker will have more than one task'
+                f' and it can blow up the distributed memory. Please choose a larger size for your tiles or increase '
+                f'your number of workers.\nDo you want to continue ?'
+            )
+            if not answer:
+                return
 
     if not os.path.exists(temp):
         os.mkdir(temp)
@@ -162,36 +188,28 @@ def process_pipelines(
     if diagnostic:
         ms = MemorySampler()
         with ms.sample(label='execution', client=client):
-            delayed = client.persist(delayed)
-            progress(delayed) if process else None
-            futures = client.compute(delayed)
-            client.gather(futures)
+            delayed = client.compute(delayed)
+            progress(delayed)
         ms.plot()
     else:
-        delayed = client.persist(delayed)
-        progress(delayed) if process else None
-        futures = client.compute(delayed)
-        client.gather(futures)
+        delayed = client.compute(delayed)
+        progress(delayed)
 
     del delayed
-    del futures
 
     file_manager.getEmptyWeight(output_directory=output)
 
     if merge_tiles and len(listdir(output)) > 1:
-        input_filename = ntpath.basename(input_dir)
-        merge = cloud.merge(output, input_filename)
+        input_filename = ntpath.basename(input_dir).split('.')[0]
+        writers = tile.get_writers(tile.load_pipeline(pipeline))
+        merge = cloud.merge(output, input_filename, writers)
         if merge is not None:
-            merge_ppln = pdal.Pipeline(cloud.merge(output, input_filename))
+            merge_ppln = pdal.Pipeline(merge)
             merge_ppln.execute()
 
-    plt.savefig(output + '/memory-usage.png') if diagnostic else None
+        if remove_tiles:
+            for f in os.listdir(output):
+                if f.split('.')[0] != input_filename:
+                    os.remove(join(output, f))
 
-
-if __name__ == "__main__":
-    process_pipelines(
-        config="D:/data_dev/pdal-parallelizer/config.json",
-        input_type="single",
-        timeout=500,
-        n_workers=3
-    )
+    plt.savefig(output + '/memory-usage.png') if diagnostic else None
diff --git a/src/pdal_parallelizer/cloud.py b/src/pdal_parallelizer/cloud.py
index 71ed7ac..cd27b61 100644
--- a/src/pdal_parallelizer/cloud.py
+++ b/src/pdal_parallelizer/cloud.py
@@ -19,16 +19,35 @@ def crop(bounds):
     return parsed
 
 
-def merge(output_dir, filename):
+def merge(output_dir, filename, writers):
     outputs = ""
+    # Default values according to the pdal writers.las documentation
+    compression = 'none'
+    minor_version = 2
+    dataformat_id = 3
+
     for f in listdir(output_dir):
         if f.split('.').count('png') <= 0:
             outputs += '"' + output_dir + '/' + f + '",'
 
     if outputs != "":
         extension = listdir(output_dir)[0].split('.')[1]
+        if extension == 'laz':
+            writers_extension = 'las'
+            compression = 'laszip'
+        else:
+            writers_extension = extension
+
+        try:
+            minor_version = writers[0]['minor_version']
+            dataformat_id = writers[0]['dataformat_id']
+        except KeyError:
+            pass
+
+        merge = '[' + outputs + '{"type": "writers.' + writers_extension + '", "filename":"' + output_dir + '/' + \
+                filename + '.' + extension + '","extra_dims": "all", "compression": "' + compression + '", ' + \
+                '"minor_version": ' + str(minor_version) + ', "dataformat_id": ' + str(dataformat_id) + '}]'
 
-        merge = '[' + outputs + '{"type": "writers.' + extension + '", "filename":"' + output_dir + '/' + filename + '","extra_dims": "all"}]'
         return merge
 
 
@@ -37,10 +56,21 @@ def addClassFlags():
     return json.loads(ferry)
 
 
+def compute_quickinfo(filepath):
+    """Returns some information about the cloud."""
+    # Get the cloud information
+    pdal_info = subprocess.run(['pdal', 'info', filepath, '--summary'],
+                               stderr=subprocess.PIPE,
+                               stdout=subprocess.PIPE)
+    info = json.loads(pdal_info.stdout.decode())
+
+    return info
+
+
 class Cloud:
     def __init__(self, filepath, bounds=None):
         self.filepath = filepath
-        self.info = self.compute_quickinfo()
+        self.info = compute_quickinfo(self.filepath)
         self.classFlags = self.hasClassFlags()
 
         # Get the cloud information to set its bounds
@@ -65,19 +95,9 @@ def __init__(self, filepath, bounds=None):
     def getCount(self):
         return self.info['summary']['num_points']
 
-    def compute_quickinfo(self):
-        """Returns some information about the cloud."""
-        # Get the cloud information
-        pdal_info = subprocess.run(['pdal', 'info', self.filepath, '--summary'],
-                                   stderr=subprocess.PIPE,
-                                   stdout=subprocess.PIPE)
-        info = json.loads(pdal_info.stdout.decode())
-
-        return info
-
     def hasClassFlags(self):
         """Check if the cloud has the ClassFlags dimension"""
-        info = self.compute_quickinfo()
+        info = compute_quickinfo(self.filepath)
         dimensions = info['summary']['dimensions']
         return 'ClassFlags' in dimensions
 
diff --git a/src/pdal_parallelizer/pdal_parallelizer_cli/__main__.py b/src/pdal_parallelizer/pdal_parallelizer_cli/__main__.py
index 9d86a34..49bc124 100644
--- a/src/pdal_parallelizer/pdal_parallelizer_cli/__main__.py
+++ b/src/pdal_parallelizer/pdal_parallelizer_cli/__main__.py
@@ -9,7 +9,7 @@
 
 
 @click.group()
-@click.version_option('2.0.2')
+@click.version_option('2.0.3')
 def main():
     """A simple parallelization tool for 3d point clouds treatment"""
     pass
@@ -27,6 +27,7 @@ def main():
 @click.option('-rb', '--remove_buffer', is_flag=True, required=False)
 @click.option('-bb', '--bounding_box', required=False, nargs=4, type=float)
 @click.option('-mt', '--merge_tiles', required=False, is_flag=True)
+@click.option('-rt', '--remove_tiles', required=False, is_flag=True)
 def process_pipelines(**kwargs):
     # Get all the options
     config = kwargs.get('config')
@@ -40,6 +41,7 @@ def process_pipelines(**kwargs):
     remove_buffer = kwargs.get('remove_buffer')
     bounding_box = kwargs.get('bounding_box')
     merge_tiles = kwargs.get('merge_tiles')
+    remove_tiles = kwargs.get('remove_tiles')
 
     process(
         config=config,
@@ -53,7 +55,7 @@ def process_pipelines(**kwargs):
         remove_buffer=remove_buffer,
         bounding_box=bounding_box,
         merge_tiles=merge_tiles,
-        process=True
+        remove_tiles=remove_tiles
     )
 
 
diff --git a/src/pdal_parallelizer/tile.py b/src/pdal_parallelizer/tile.py
index 55563cc..9f6d2fe 100644
--- a/src/pdal_parallelizer/tile.py
+++ b/src/pdal_parallelizer/tile.py
@@ -17,6 +17,20 @@
 from . import bounds
 
 
+def load_pipeline(pipeline):
+    with open(pipeline, 'r') as ppln:
+        p = json.load(ppln)
+    return p
+
+
+def get_readers(pipeline):
+    return list(filter(lambda x: x['type'].startswith('readers'), pipeline))
+
+
+def get_writers(pipeline):
+    return list(filter(lambda x: x['type'].startswith('writers'), pipeline))
+
+
 class Tile:
     def __init__(self, filepath, output_dir, json_pipeline, name=None, bounds=None, buffer=None, remove_buffer=False, cloud_object=None):
         self.filepath = filepath
@@ -45,52 +59,54 @@ def pipeline(self, single_file):
         """Assign a pipeline to the tile"""
         output_dir = self.output_dir
 
-        # Open the pipeline
-        with open(self.json_pipeline, 'r') as pipeline:
-            p = json.load(pipeline)
-
-            # Create the name of the temp file associated to the pipeline
-            temp_name = 'temp__' + self.getName()
-            output_filename = f'{output_dir}/{self.getName()}'
-            # Get the reader and the writer
-            reader = list(filter(lambda x: x['type'].startswith('readers'), p))
-            writer = list(filter(lambda x: x['type'].startswith('writers'), p))
+        p = load_pipeline(self.json_pipeline)
+
+        # Create the name of the temp file associated to the pipeline
+        temp_name = 'temp__' + self.getName()
+        output_filename = f'{output_dir}/{self.getName()}'
+        # Get the reader and the writer
+        reader = get_readers(p)
+        writer = get_writers(p)
+        try:
+            compression = writer[0]['compression']
+            extension = '.laz' if compression == 'laszip' or compression == 'lazperf' else '.las'
+        except KeyError:
             # Get the extension for the output
             extension = '.' + writer[0]['type'].split('.')[1] + '.las' if writer[0]['type'].split('.')[1] == 'copc' else '.' + writer[0]['type'].split('.')[1]
 
-            # If there is a buffer
-            if self.buffer:
-                # If the writer is 'writers.copc' or 'writer.laz' or 'writers.copc'
-                if writer[0]['type'].split('.')[1] in ['las', 'laz', 'copc']:
-                    # Insert the filter to assign the wittheld flag to the buffer
-                    p.insert(1, self.assign)
-            if self.remove_buffer:
-                # Remove the buffer
-                p.insert(len(p) - 1, bounds.removeBuffer(self.bounds_without_buffer))
-
-            if self.cloud:
-                # If there is the ferry filter attribute in the tile instance
-                if not self.cloud.classFlags:
-                    # Insert the filter to add the ClassFlags dimension
-                    p.insert(1, cloud.addClassFlags())
-
-            # The pipeline must contain a reader AND a writer
-            if not reader:
-                sys.exit("Please add a reader to your pipeline.")
-            elif not writer:
-                sys.exit("Please add a writer to your pipeline.")
-
-            # If the input is a single file
-            if single_file:
-                # Add a crop filter to divide the cloud in small tiles
-                p.insert(1, cloud.crop(self.bounds))
-
-            # Add the filename option in the pipeline's reader to get the right file
-            reader[0]['filename'] = self.filepath
-            # Add the filename option in the pipeline's writer to write the result in the right file
-            writer[0]['filename'] = output_filename + extension
-
-            p = pdal.Pipeline(json.dumps(p))
+        # If there is a buffer
+        if self.buffer:
+            # If the writer is 'writers.copc' or 'writer.laz' or 'writers.copc'
+            if writer[0]['type'].split('.')[1] in ['las', 'laz', 'copc']:
+                # Insert the filter to assign the wittheld flag to the buffer
+                p.insert(1, self.assign)
+        if self.remove_buffer:
+            # Remove the buffer
+            p.insert(len(p) - 1, bounds.removeBuffer(self.bounds_without_buffer))
+
+        if self.cloud:
+            # If there is the ferry filter attribute in the tile instance
+            if not self.cloud.classFlags:
+                # Insert the filter to add the ClassFlags dimension
+                p.insert(1, cloud.addClassFlags())
+
+        # The pipeline must contain a reader AND a writer
+        if not reader:
+            sys.exit("Please add a reader to your pipeline.")
+        elif not writer:
+            sys.exit("Please add a writer to your pipeline.")
+
+        # If the input is a single file
+        if single_file:
+            # Add a crop filter to divide the cloud in small tiles
+            p.insert(1, cloud.crop(self.bounds))
+
+        # Add the filename option in the pipeline's reader to get the right file
+        reader[0]['filename'] = self.filepath
+        # Add the filename option in the pipeline's writer to write the result in the right file
+        writer[0]['filename'] = output_filename + extension
+
+        p = pdal.Pipeline(json.dumps(p))
 
         return p, temp_name