diff --git a/.github/workflows/test-code-higher_order_syntactic_profiler.yml b/.github/workflows/test-code-higher_order_syntactic_profiler.yml new file mode 100644 index 000000000..7165e9466 --- /dev/null +++ b/.github/workflows/test-code-higher_order_syntactic_profiler.yml @@ -0,0 +1,124 @@ +# +# DO NOT EDIT THIS FILE: it is generated from test-transform.template, Edit there and run make to change these files +# +name: Test - transforms/code/higher_order_syntactic_profiler + +on: + workflow_dispatch: + push: + branches: + - "dev" + - "releases/**" + tags: + - "*" + paths: + - "transforms/code/higher_order_syntactic_profiler/**" + - "data-processing-lib/**" + - "!transforms/code/higher_order_syntactic_profiler/**/kfp_ray/**" # This is/will be tested in separate workflow + - "!data-processing-lib/**/test/**" + - "!data-processing-lib/**/test-data/**" + - "!**.md" + - "!**/doc/**" + - "!**/images/**" + - "!**.gitignore" + pull_request: + branches: + - "dev" + - "releases/**" + paths: + - "transforms/code/higher_order_syntactic_profiler/**" + - "data-processing-lib/**" + - "!transforms/code/higher_order_syntactic_profiler/**/kfp_ray/**" # This is/will be tested in separate workflow + - "!data-processing-lib/**/test/**" + - "!data-processing-lib/**/test-data/**" + - "!**.md" + - "!**/doc/**" + - "!**/images/**" + - "!**.gitignore" + +jobs: + check_if_push_image: + # check whether the Docker images should be pushed to the remote repository + # The images are pushed if it is a merge to dev branch or a new tag is created. + # The latter being part of the release process. + # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. + runs-on: ubuntu-22.04 + outputs: + publish_images: ${{ steps.version.outputs.publish_images }} + steps: + - id: version + run: | + publish_images='false' + if [[ ${GITHUB_REF} == refs/heads/dev && ${GITHUB_EVENT_NAME} != 'pull_request' && ${GITHUB_REPOSITORY} == IBM/data-prep-kit ]] ; + then + publish_images='true' + fi + if [[ ${GITHUB_REF} == refs/tags/* && ${GITHUB_REPOSITORY} == IBM/data-prep-kit ]] ; + then + publish_images='true' + fi + echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" + test-src: + runs-on: ubuntu-22.04 + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Free up space in github runner + # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173 + run: | + df -h + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" + sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/local/.ghcup + sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true + df -h + - name: Test transform source in transforms/code/higher_order_syntactic_profiler + run: | + if [ -e "transforms/code/higher_order_syntactic_profiler/Makefile" ]; then + make -C transforms/code/higher_order_syntactic_profiler DOCKER=docker test-src + else + echo "transforms/code/higher_order_syntactic_profiler/Makefile not found - source testing disabled for this transform." 
+ fi + test-image: + needs: [check_if_push_image] + runs-on: ubuntu-22.04 + timeout-minutes: 120 + env: + DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} + DOCKER_REGISTRY_KEY: ${{ secrets.DOCKER_REGISTRY_KEY }} + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Free up space in github runner + # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173 + run: | + df -h + sudo rm -rf /opt/ghc + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" + sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/lib/jvm /usr/local/.ghcup + sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true + df -h + - name: Test transform image in transforms/code/higher_order_syntactic_profiler + run: | + if [ -e "transforms/code/higher_order_syntactic_profiler/Makefile" ]; then + if [ -d "transforms/code/higher_order_syntactic_profiler/spark" ]; then + make -C data-processing-lib/spark DOCKER=docker image + fi + make -C transforms/code/higher_order_syntactic_profiler DOCKER=docker test-image + else + echo "transforms/code/higher_order_syntactic_profiler/Makefile not found - testing disabled for this transform." + fi + - name: Print space + # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173 + run: | + df -h + docker images + - name: Publish images + if: needs.check_if_push_image.outputs.publish_images == 'true' + run: | + if [ -e "transforms/code/higher_order_syntactic_profiler/Makefile" ]; then + make -C transforms/code/higher_order_syntactic_profiler publish + else + echo "transforms/code/higher_order_syntactic_profiler/Makefile not found - publishing disabled for this transform." + fi diff --git a/.github/workflows/test-code-semantic_profiler.yml b/.github/workflows/test-code-semantic_profiler.yml new file mode 100644 index 000000000..6c301ee38 --- /dev/null +++ b/.github/workflows/test-code-semantic_profiler.yml @@ -0,0 +1,124 @@ +# +# DO NOT EDIT THIS FILE: it is generated from test-transform.template, Edit there and run make to change these files +# +name: Test - transforms/code/semantic_profiler + +on: + workflow_dispatch: + push: + branches: + - "dev" + - "releases/**" + tags: + - "*" + paths: + - "transforms/code/semantic_profiler/**" + - "data-processing-lib/**" + - "!transforms/code/semantic_profiler/**/kfp_ray/**" # This is/will be tested in separate workflow + - "!data-processing-lib/**/test/**" + - "!data-processing-lib/**/test-data/**" + - "!**.md" + - "!**/doc/**" + - "!**/images/**" + - "!**.gitignore" + pull_request: + branches: + - "dev" + - "releases/**" + paths: + - "transforms/code/semantic_profiler/**" + - "data-processing-lib/**" + - "!transforms/code/semantic_profiler/**/kfp_ray/**" # This is/will be tested in separate workflow + - "!data-processing-lib/**/test/**" + - "!data-processing-lib/**/test-data/**" + - "!**.md" + - "!**/doc/**" + - "!**/images/**" + - "!**.gitignore" + +jobs: + check_if_push_image: + # check whether the Docker images should be pushed to the remote repository + # The images are pushed if it is a merge to dev branch or a new tag is created. + # The latter being part of the release process. + # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. 
+ runs-on: ubuntu-22.04 + outputs: + publish_images: ${{ steps.version.outputs.publish_images }} + steps: + - id: version + run: | + publish_images='false' + if [[ ${GITHUB_REF} == refs/heads/dev && ${GITHUB_EVENT_NAME} != 'pull_request' && ${GITHUB_REPOSITORY} == IBM/data-prep-kit ]] ; + then + publish_images='true' + fi + if [[ ${GITHUB_REF} == refs/tags/* && ${GITHUB_REPOSITORY} == IBM/data-prep-kit ]] ; + then + publish_images='true' + fi + echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" + test-src: + runs-on: ubuntu-22.04 + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Free up space in github runner + # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173 + run: | + df -h + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" + sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/local/.ghcup + sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true + df -h + - name: Test transform source in transforms/code/semantic_profiler + run: | + if [ -e "transforms/code/semantic_profiler/Makefile" ]; then + make -C transforms/code/semantic_profiler DOCKER=docker test-src + else + echo "transforms/code/semantic_profiler/Makefile not found - source testing disabled for this transform." + fi + test-image: + needs: [check_if_push_image] + runs-on: ubuntu-22.04 + timeout-minutes: 120 + env: + DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} + DOCKER_REGISTRY_KEY: ${{ secrets.DOCKER_REGISTRY_KEY }} + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Free up space in github runner + # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173 + run: | + df -h + sudo rm -rf /opt/ghc + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" + sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/lib/jvm /usr/local/.ghcup + sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true + df -h + - name: Test transform image in transforms/code/semantic_profiler + run: | + if [ -e "transforms/code/semantic_profiler/Makefile" ]; then + if [ -d "transforms/code/semantic_profiler/spark" ]; then + make -C data-processing-lib/spark DOCKER=docker image + fi + make -C transforms/code/semantic_profiler DOCKER=docker test-image + else + echo "transforms/code/semantic_profiler/Makefile not found - testing disabled for this transform." + fi + - name: Print space + # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173 + run: | + df -h + docker images + - name: Publish images + if: needs.check_if_push_image.outputs.publish_images == 'true' + run: | + if [ -e "transforms/code/semantic_profiler/Makefile" ]; then + make -C transforms/code/semantic_profiler publish + else + echo "transforms/code/semantic_profiler/Makefile not found - publishing disabled for this transform." 
+            fi
diff --git a/.make.versions b/.make.versions
index 93e4efcb1..1c737b9ef 100644
--- a/.make.versions
+++ b/.make.versions
@@ -111,6 +111,11 @@ HTML2PARQUET_PYTHON_VERSION=$(DPK_VERSION)
 
 DPK_TRANSFORMS_VERSION=$(DPK_VERSION)
 
+HOSP_PYTHON_VERSION=$(DPK_VERSION)
+HOSP_RAY_VERSION=$(DPK_VERSION)
+SP_PYTHON_VERSION=$(DPK_VERSION)
+SP_RAY_VERSION=$(DPK_VERSION)
+
 ################## ################## ################## ################## ################## ##################
 # Begin versions that the repo depends on.
diff --git a/transforms/code/higher_order_syntactic_profiler/Makefile b/transforms/code/higher_order_syntactic_profiler/Makefile
new file mode 100644
index 000000000..e8acb2e36
--- /dev/null
+++ b/transforms/code/higher_order_syntactic_profiler/Makefile
@@ -0,0 +1,78 @@
+REPOROOT=../../..
+# Use make help, to see the available rules
+include $(REPOROOT)/.make.defaults
+
+setup::
+	@# Help: Recursively make $@ in all subdirs
+	$(MAKE) RULE=$@ .recurse
+
+clean::
+	@# Help: Recursively make $@ in all subdirs
+	$(MAKE) RULE=$@ .recurse
+
+build::
+	@# Help: Recursively make $@ in all subdirs
+	$(MAKE) RULE=$@ .recurse
+venv::
+	@# Help: Recursively make $@ in all subdirs
+	$(MAKE) RULE=$@ .recurse
+
+image::
+	@# Help: Recursively make $@ in all subdirs
+	@$(MAKE) RULE=$@ .recurse
+
+set-versions:
+	@# Help: Recursively make $@ in all subdirs
+	@$(MAKE) RULE=$@ .recurse
+
+publish::
+	@# Help: Recursively make $@ in all subdirs
+	@$(MAKE) RULE=$@ .recurse
+
+test-image::
+	@# Help: Recursively make $@ in all subdirs
+	@$(MAKE) RULE=$@ .recurse
+
+test::
+	@# Help: Recursively make $@ in all subdirs
+	@$(MAKE) RULE=$@ .recurse
+
+test-src::
+	@# Help: Recursively make $@ in all subdirs
+	$(MAKE) RULE=$@ .recurse
+
+kind-load-image::
+	@# Help: Recursively make $@ in all subdirs
+	$(MAKE) RULE=$@ .recurse
+
+docker-load-image::
+	@# Help: Recursively make $@ in all subdirs
+	$(MAKE) RULE=$@ .recurse
+
+docker-save-image::
+	@# Help: Recursively make $@ in all subdirs
+	$(MAKE) RULE=$@ .recurse
+
+.PHONY: workflow-venv
+workflow-venv:
+	if [ -e kfp_ray ]; then \
+		$(MAKE) -C kfp_ray workflow-venv; \
+	fi
+
+.PHONY: workflow-test
+workflow-test:
+	if [ -e kfp_ray ]; then \
+		$(MAKE) -C kfp_ray workflow-test; \
+	fi
+
+.PHONY: workflow-upload
+workflow-upload:
+	if [ -e kfp_ray ]; then \
+		$(MAKE) -C kfp_ray workflow-upload; \
+	fi
+
+.PHONY: workflow-build
+workflow-build:
+	if [ -e kfp_ray ]; then \
+		$(MAKE) -C kfp_ray workflow-build; \
+	fi
\ No newline at end of file
diff --git a/transforms/code/higher_order_syntactic_profiler/README.md b/transforms/code/higher_order_syntactic_profiler/README.md
new file mode 100644
index 000000000..512630714
--- /dev/null
+++ b/transforms/code/higher_order_syntactic_profiler/README.md
@@ -0,0 +1,12 @@
+# HOSP Transform
+The HOSP transform enables the profiling of a given code dataset based on higher order
+syntactic and semantic concepts as specified by the user. It also generates a profiling
+report in HTML, based on the output table. Per the set of
+[transform project conventions](../../README.md#transform-project-conventions),
+the following runtimes are available:
+
+* [python](python/README.md) - provides the base python-based transformation
+implementation.
+* [ray](ray/README.md) - enables the running of the base python transformation +in a Ray runtime + diff --git a/transforms/code/higher_order_syntactic_profiler/python/.dockerignore b/transforms/code/higher_order_syntactic_profiler/python/.dockerignore new file mode 100644 index 000000000..f7275bbbd --- /dev/null +++ b/transforms/code/higher_order_syntactic_profiler/python/.dockerignore @@ -0,0 +1 @@ +venv/ diff --git a/transforms/code/higher_order_syntactic_profiler/python/Dockerfile b/transforms/code/higher_order_syntactic_profiler/python/Dockerfile new file mode 100644 index 000000000..8f444ec15 --- /dev/null +++ b/transforms/code/higher_order_syntactic_profiler/python/Dockerfile @@ -0,0 +1,41 @@ +FROM docker.io/python:3.10.14-slim-bullseye + +RUN pip install --upgrade --no-cache-dir pip + +# install pytest +RUN pip install --no-cache-dir pytest + +# Create a user and use it to run the transform +RUN useradd -ms /bin/bash dpk +USER dpk +WORKDIR /home/dpk + +# Copy and install data processing libraries +# These are expected to be placed in the docker context before this is run (see the make image). +COPY --chown=dpk:root data-processing-lib-python/ data-processing-lib-python/ +RUN cd data-processing-lib-python && pip install --no-cache-dir -e . + +# END OF STEPS destined for a data-prep-kit base image + +COPY --chown=dpk:root src/ src/ +COPY --chown=dpk:root pyproject.toml pyproject.toml +RUN pip install --no-cache-dir -e . + +# copy transform main() entry point to the image +COPY ./src/hosp_transform_python.py . + +# copy some of the samples in +COPY ./src/hosp_local.py local/ + +# copy test +COPY test/ test/ +COPY test-data/ test-data/ + +# Set environment +ENV PYTHONPATH /home/dpk + +# Put these at the end since they seem to upset the docker cache. +ARG BUILD_DATE +ARG GIT_COMMIT +LABEL build-date=$BUILD_DATE +LABEL git-commit=$GIT_COMMIT diff --git a/transforms/code/higher_order_syntactic_profiler/python/Makefile b/transforms/code/higher_order_syntactic_profiler/python/Makefile new file mode 100644 index 000000000..ac9d7d5fa --- /dev/null +++ b/transforms/code/higher_order_syntactic_profiler/python/Makefile @@ -0,0 +1,66 @@ +# Define the root of the local git clone for the common rules to be able +# know where they are running from. +REPOROOT=../../../.. +# Include a library of common .transform.* targets which most +# transforms should be able to reuse. However, feel free +# to override/redefine the rules below. + +# $(REPOROOT)/.make.versions file contains the versions +include $(REPOROOT)/transforms/.make.transforms + +TRANSFORM_NAME=hosp + + +venv:: .transforms.python-venv + +test:: .transforms.python-test + +clean:: .transforms.clean + +image:: .transforms.python-image + +test-src:: .transforms.test-src + +setup:: .transforms.setup + +build:: build-dist image + +publish: publish-image + +publish-image:: .transforms.publish-image-python + +setup:: .transforms.setup + +# distribution versions is the same as image version. 
+set-versions:
+	$(MAKE) TRANSFORM_PYTHON_VERSION=$(HOSP_PYTHON_VERSION) TOML_VERSION=$(HOSP_PYTHON_VERSION) .transforms.set-versions
+
+build-dist:: .defaults.build-dist
+
+publish-dist:: .defaults.publish-dist
+
+test-image:: .transforms.python-test-image
+
+# Ensure RUN_ARGS has a default value
+RUN_ARGS ?= ""
+
+run-cli-sample: .transforms.run-cli-python-sample
+
+run-local-sample: .transforms.run-local-sample
+
+run-local-python-sample: .transforms.run-local-python-sample
+
+# run-local-python-sample:
+#	$(MAKE) RUN_FILE=sp_local_python.py \
+#		.transforms.run-local-python-sample
+
+# RUN_ARGS="--sp_ikb 'Contents' --language 'Language'" \
+#run-s3-ray-sample: .transforms.run-s3-ray-sample
+
+minio-start: .minio-start
+
+kind-load-image:: .transforms.kind-load-image
+
+docker-load-image: .defaults.docker-load-image
+
+docker-save-image: .defaults.docker-save-image
diff --git a/transforms/code/higher_order_syntactic_profiler/python/README.md b/transforms/code/higher_order_syntactic_profiler/python/README.md
new file mode 100644
index 000000000..677c13de1
--- /dev/null
+++ b/transforms/code/higher_order_syntactic_profiler/python/README.md
@@ -0,0 +1,70 @@
+# HOSP Transform
+Please see the set of
+[transform project conventions](../../../README.md#transform-project-conventions)
+for details on general project conventions, transform configuration,
+testing and IDE set up.
+
+## Summary
+This transform implements a higher order concept profiler for a given code dataset.
+A user can specify the concepts that are of interest to the downstream use case. These
+concepts can be a complex combination of syntactic and semantic criteria. Based on this,
+the input table containing the UBSRs, base syntactic, and semantic concepts is queried to generate
+the required results. The current implementation provides a single metric - the code-to-comment ratio.
+However, this is easily extensible. Examples of other higher order concepts are the cyclomatic complexity
+of all Python samples in the dataset and the line coverage of all samples in a given semantic category.
+
+## Configuration and Command Line Options
+
+The set of dictionary keys holding [HOSPTransform](src/hosp_transform.py)
+configuration values are as follows:
+
+* _hosp_metrics_list_ - specifies the list of metrics that the user requires in the profiling report.
+The list of metrics has to be predefined, and the corresponding implementation logic has to be in place
+in the code beforehand.
+
+
+## Running
+
+### Launched Command Line Options
+The following command line arguments are available in addition to
+the options provided by
+the [python launcher](../../../../data-processing-lib/doc/python-launcher-options.md).
+```
+  --hosp_metrics_list HOSP_METRICS_LIST
+
+                        List of metrics specified by the user for the profiling report.
+```
+
+| Parameter | Default | Description |
+|------------|----------|--------------|
+| `HOSP_METRICS_LIST` | `CCR` | Metrics to be calculated for profiling. Multiple metrics can be entered separated by spaces. The only valid metric at present is `CCR`. |
+
+These correspond to the configuration keys described above.
+
+### Running the samples
+To run the samples, use the following `make` targets:
+
+* `run-cli-sample` - runs src/hosp_transform.py using command line args
+* `run-local-sample` - runs src/hosp_local.py
+
+These targets will activate the virtual environment and set up any configuration needed.
+Use the `-n` option of `make` to see the details of what is done to run the sample.
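+
+In addition, transform-specific flags from the table above can be forwarded to the
+sample through the `RUN_ARGS` variable declared in the Makefile. The sketch below
+assumes the shared `.transforms` rules pass `RUN_ARGS` through to the transform's
+command line, so verify that plumbing before relying on it:
+
+```shell
+# assumes the venv has been built (make venv); CCR is currently the only valid metric
+make RUN_ARGS="--hosp_metrics_list CCR" run-cli-sample
+```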
+ +For example, +```shell +make run-cli-sample +... +``` +Then + +```shell +ls output +``` +To see results of the transform. + +### Transforming data using the transform image + +To use the transform image to transform your data, please refer to the +[running images quickstart](../../../../doc/quick-start/run-transform-image.md), +substituting the name of this transform image and runtime as appropriate. + diff --git a/transforms/code/higher_order_syntactic_profiler/python/pyproject.toml b/transforms/code/higher_order_syntactic_profiler/python/pyproject.toml new file mode 100644 index 000000000..a996e16cf --- /dev/null +++ b/transforms/code/higher_order_syntactic_profiler/python/pyproject.toml @@ -0,0 +1,55 @@ +[project] +name = "dpk_hosp_transform_python" +version = "0.2.1.dev0" +requires-python = ">=3.10" +description = "Higher Order Syntactic Profiler Python Transform" +license = {text = "Apache-2.0"} +readme = {file = "README.md", content-type = "text/markdown"} +authors = [ + { name = "Aishwariya Chakraborty", email = "aishwariya.chakraborty1@ibm.com" }, +] +dependencies = [ + "data-prep-toolkit==0.2.1.dev0", + "networkx==3.0.0", + "jinja2==3.1.2", + "plotly==5.15.0", + "matplotlib" + +] + +[build-system] +requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] +build-backend = "setuptools.build_meta" + +[project.optional-dependencies] +dev = [ + "twine", + "pytest>=7.3.2", + "pytest-dotenv>=0.5.2", + "pytest-env>=1.0.0", + "pre-commit>=3.3.2", + "pytest-cov>=4.1.0", + "pytest-mock>=3.10.0", + "moto==5.0.5", + "markupsafe==2.0.1", +] + +[options] +package_dir = ["src","test"] + +[options.packages.find] +where = ["src/"] + +[tool.pytest.ini_options] +# Currently we use low coverage since we have to run tests separately (see makefile) +#addopts = "--cov --cov-report term-missing --cov-fail-under 25" +markers = ["unit: unit tests", "integration: integration tests"] + +[tool.coverage.run] +include = ["src/*"] + +[tool.setuptools.packages.find] +where = ["src"] + +[tool.setuptools.package-data] +'src' = ['template.html'] \ No newline at end of file diff --git a/transforms/code/higher_order_syntactic_profiler/python/src/UAST.py b/transforms/code/higher_order_syntactic_profiler/python/src/UAST.py new file mode 100644 index 000000000..6406a7aee --- /dev/null +++ b/transforms/code/higher_order_syntactic_profiler/python/src/UAST.py @@ -0,0 +1,270 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import json +import networkx +import matplotlib.pyplot as plt + +class UASTNode: + """ + Represents a node in the Universal Abstract Syntax Tree (UAST). + + Attributes: + id (int): The unique identifier of the node. + code_snippet (str): The line(s) of code associated with the node. + node_type (str): The type of the node. + parents (list): The list of parent nodes. + children (list): The list of child nodes. 
+        metadata (dict): The associated information/metadata of the node.
+        start_point (tuple(int, int)): The start line number and byte of the line of the node.
+        end_point (tuple(int, int)): The end line number and byte of the node.
+    """
+
+    def __init__(self,
+                id: int = 0,
+                code_snippet: str = None,
+                node_type: str = None,
+                parents: list = None,
+                children: list = None,
+                metadata : dict = None,
+                start_point : tuple[int,int] = (None, None),
+                end_point : tuple[int,int] = (None, None)) -> None:
+
+        self.id = id
+        self.code_snippet = code_snippet
+        self.node_type = node_type
+        # default to fresh containers here rather than in the signature, to avoid
+        # the shared mutable-default-argument pitfall
+        self.parents = parents if parents is not None else list()
+        self.children = children if children is not None else list()
+        self.metadata = metadata if metadata is not None else dict()
+        self.start_point = start_point
+        self.end_point = end_point
+
+    def __str__(self) -> str:
+        return f"ID: {self.id}, Type: {self.node_type}, Snippet: {repr(self.code_snippet)}, Parents: {self.parents}, Children: {self.children}, Metadata = {self.metadata}"
+
+    def __repr__(self) -> str:
+        return self.__str__()
+
+    def __eq__(self, other) -> bool:
+        return self.id == other.id and self.code_snippet == other.code_snippet and self.node_type == other.node_type and self.parents == other.parents and self.children == other.children and self.metadata == other.metadata and self.start_point == other.start_point and self.end_point == other.end_point
+
+class UASTEdge:
+    """
+    Represents an edge in the UAST (Universal Abstract Syntax Tree).
+
+    Attributes:
+        start_id (int): The ID of the starting node of the edge.
+        end_id (int): The ID of the ending node of the edge.
+        directed_relation (str): The directed relation between the nodes.
+        metadata (dict): The metadata information associated with the edge.
+    """
+
+    def __init__(self,
+                start_id: int = None,
+                end_id: int = None,
+                directed_relation: str = None,
+                metadata : dict = None):
+
+        self.start_id = start_id
+        self.end_id = end_id
+        self.directed_relation = directed_relation
+        self.metadata = metadata if metadata is not None else dict()
+
+    def __str__(self) -> str:
+        return f"Start: {self.start_id}, End: {self.end_id}, Relation: {self.directed_relation}, Metadata = {self.metadata}"
+
+    def __repr__(self) -> str:
+        return self.__str__()
+
+    def __eq__(self, other) -> bool:
+        return self.start_id == other.start_id and self.end_id == other.end_id and self.directed_relation == other.directed_relation and self.metadata == other.metadata
+
+    def __hash__(self) -> int:
+        # metadata is a dict and therefore unhashable; it is deliberately left out of
+        # the hash (equal edges still hash equally, which is all hashing requires)
+        return hash((self.start_id, self.end_id, self.directed_relation))
+
+class UAST:
+    """
+    Represents a graph of a Universal Abstract Syntax Tree (UAST).
+
+    Attributes:
+        nodes (dict[int, UASTNode]): A dictionary mapping node IDs to UASTNode objects.
+        edges (list[UASTEdge]): A list of UASTEdge objects representing the edges between nodes.
+        assigned_id (int): The ID to be assigned to the next node added to the UAST.
+
+    Methods:
+        __init__(): Initializes an empty UAST object.
+        __len__(): Returns the number of nodes in the UAST.
+        __str__(): Returns a string representation of the UAST.
+        __repr__(): Returns a string representation of the UAST.
+        __eq__(other): Checks if the UAST is equal to another UAST.
+        add_node(node): Adds a node to the UAST.
+        _create_root(): Creates a root node for the UAST.
+ create_node(node_type, code_snippet, start_point, end_point): Creates a new node and adds it to the UAST, also returns the node object. + add_edge(node1, node2, directed_relation, metadata): Adds an edge between two nodes in the UAST. + get_node(id): Retrieves a node from the UAST based on its ID. + get_nodes_of_type(node_type): Retrieves the ID of all nodes of the input type + get_children(node): Retrieves the children of a node in the UAST. + get_parents(node): Retrieves the parent of a node in the UAST. + print_graph(id): Prints the UAST starting from the specified node ID. + save_to_file(file_path): Saves the UAST to a file in JSON format. + load_from_file(file_path): Loads the UAST from a file in JSON format. + visualize(): Visualizes the graph using NetworkX + """ + def __init__(self): + self.nodes : dict[int,UASTNode] = dict() + self.edges : list[UASTEdge] = list() + self.assigned_id : int = 0 + self.nodes_of_type : dict = dict() + self.root = self._create_root() + + def __len__(self) -> int: + return len(self.nodes) + + def __str__(self) -> str: + return f"Nodes: {self.nodes} \nEdges: {self.edges}" + + def __repr__(self) -> str: + return f"Nodes: {self.nodes} \nEdges: {self.edges}" + + def __eq__(self, other) -> bool: + return self.nodes == other.nodes and self.edges == other.edges + + def add_node(self, node : UASTNode) -> None: + self.nodes[self.assigned_id] = node + self.assigned_id += 1 + if node.node_type not in self.nodes_of_type : + self.nodes_of_type[node.node_type] = list() + self.nodes_of_type[node.node_type].append(node.id) + return + + def _create_root(self) -> UASTNode: + return self.create_node(node_type = "uast_root", code_snippet = "root", metadata= {"info" : "links to all"}, start_point = (-1,0), end_point = (-1,3)) + + def create_node(self, + node_type : str = None, + code_snippet : str = None, + metadata : dict = dict(), + start_point : tuple[int,int] = (None, None), + end_point : tuple[int,int] = (None, None)) -> UASTNode: + + node = UASTNode(id = self.assigned_id, node_type = node_type, code_snippet = code_snippet, metadata = metadata, start_point = start_point, end_point = end_point, children= list(), parents = list()) + self.add_node(node) + return node + + def add_edge(self, node1 : UASTNode = None, node2 : UASTNode = None, directed_relation : str = None, metadata : dict = dict())-> UASTEdge: + edge = UASTEdge(start_id = node1.id, end_id = node2.id, directed_relation = directed_relation, metadata = metadata) + node2.parents.append(node1.id) + node1.children.append(node2.id) + self.edges.append(edge) + return edge + + def get_node(self, id : int) -> UASTNode: + return self.nodes[id] + + def get_nodes_of_type(self, node_type : str) -> list[int]: + return self.nodes_of_type[node_type] + + def get_children(self, node : UASTNode) -> list[int]: + return node.children + + def get_parents(self, node : UASTNode) -> int: + return node.parents + + def print_graph(self, id): + if id not in self.nodes: + return + visited = set() + + def dfs(id, visited): + visited.add(id) + print(self.nodes[id]) + for child in self.nodes[id].children: + if child not in visited: + dfs(child, visited) + + dfs(id, visited) + del visited + + + def save_to_file(self, file_path): + # convert children list to list for serialization + copy_nodes = self.nodes.copy() + for k, v in self.nodes.items(): + v.children = list(v.children) + v.parents = list(v.parents) + copy_nodes[k] = v + + + data = { + "nodes": {str(k): v.__dict__ for k, v in self.nodes.items()}, + "edges": [edge.__dict__ for edge in 
self.edges] + } + + with open(file_path, 'w') as f: + json.dump(data, f, indent= 4) + + return + + def get_json(self): + + copy_nodes = self.nodes.copy() + for k, v in self.nodes.items(): + v.children = list(v.children) + v.parents = list(v.parents) + copy_nodes[k] = v + + data = { + "nodes": {str(k): v.__dict__ for k, v in self.nodes.items()}, + "edges": [edge.__dict__ for edge in self.edges] + } + + return data + + def load_from_json_string(self, obj: str): + data = json.loads(obj) + self.nodes = {int(k): UASTNode(**v) for k, v in data["nodes"].items()} + self.edges = [UASTEdge(**edge) for edge in data["edges"]] + self.assigned_id = max(self.nodes.keys()) + 1 + for node in self.nodes.values(): + node.start_point = tuple(node.start_point) + node.end_point = tuple(node.end_point) + return + + def load_from_file(self, file_path): + with open(file_path, 'r') as f: + data = json.load(f) + self.nodes = {int(k): UASTNode(**v) for k, v in data["nodes"].items()} + self.edges = [UASTEdge(**edge) for edge in data["edges"]] + self.assigned_id = max(self.nodes.keys()) + 1 + for node in self.nodes.values(): + node.start_point = tuple(node.start_point) + node.end_point = tuple(node.end_point) + return + + def visualize(self): + edges_viz = [] + labeldict = {} + for edge in self.edges: + edges_viz.append([edge.start_id, edge.end_id]) + labeldict[edge.start_id] = self.nodes[edge.start_id].node_type + labeldict[edge.end_id] = self.nodes[edge.end_id].node_type + print(labeldict) + plt.figure(figsize=(10,10)) + plt.rcParams["font.size"] = 20 + G = networkx.Graph() + G.add_edges_from(edges_viz) + pos = networkx.spring_layout(G) + networkx.draw_networkx_labels(G, pos, labels= labeldict, font_size= 12, ) + networkx.draw_networkx_nodes(G, pos, nodelist= self.nodes.keys(), node_size= 300) + networkx.draw_networkx_edges(G, pos, edgelist= edges_viz) + plt.show() + return diff --git a/transforms/code/higher_order_syntactic_profiler/python/src/UAST_parser.py b/transforms/code/higher_order_syntactic_profiler/python/src/UAST_parser.py new file mode 100644 index 000000000..6b1fd07bb --- /dev/null +++ b/transforms/code/higher_order_syntactic_profiler/python/src/UAST_parser.py @@ -0,0 +1,254 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +from UAST import UAST +import json +from tree_sitter import Tree +import os +""" +Initialize the parser with a path for rules and grammar. +""" +class UASTParser(): + def __init__(self): + self.language : str = None + self.uast : UAST = None + self.rules : dict = None + self.cached_rules = dict() + + # Load UAST Grammar + self.grammar_path = "../../python/src/grammar/UAST_Grammar.json" + if not os.path.exists(self.grammar_path): + print("Current working directory:", os.getcwd()) + raise FileNotFoundError(f"UAST Grammar file not found at {self.grammar_path}. 
Please ensure it exists.") + + with open(self.grammar_path, "r") as grammar_file: + self.grammar = json.load(grammar_file) + + # Rule directory and file + self.rule_directory = "../../python/src/ruleset/" + if not os.path.isdir(self.rule_directory): + print("Current working directory:", os.getcwd()) + raise FileNotFoundError(f"Ruleset directory not found at {self.rule_directory}. Please ensure it exists.") + + self.rule_file_name: str = "UAST_rules_" + + self.AST : Tree = None + # self.offset : int = None + # self.prev_line : int = -1 + self.extracted : str = None + self.function_info = dict() + self.class_info = dict() + self.user_defined_entity = {"uast_function": "self.function_info[snippet] = id", + "uast_class": "self.class_info[snippet] = id"} + + + def set_rule_dir_path(self, path: str): + self.rule_directory = path + + def set_grammar_path(self, path : str): + self.grammar_path = path + self.grammar = json.load(open(self.grammar_path, "r")) + + # set language for the parser + def set_language(self, language : str): + self.language = language + + if (language not in self.cached_rules): + rules_cache = json.load(open(self.rule_directory + self.rule_file_name + self.language + '.json', "r")) + self.cached_rules[language] = rules_cache + + self.rules = self.cached_rules[language] + + # initialise a DFS traversal on the AST and an empty UAST. + def parse(self, AST, code_snippet) : + if(self.language == None) : + print("Language not loaded") + return + self.AST = AST + self.uast = UAST() + self.uast.root.metadata["language"] = self.language + self.uast.root.metadata["loc_snippet"] = self.count_loc(code_snippet, self.language) + self._dfs(AST_node = self.AST.root_node, parent = self.uast.root) + ''' + # commenting this block temporarily + # Call the new modularized function to calculate the code-to-comment ratio + code_to_comment_ratio = self.calculate_code_to_comment_ratio(self.uast.root) + # Add the code_to_comment_ratio to the root node's metadata + self.uast.root.metadata["code_to_comment_ratio"] = code_to_comment_ratio + ''' + return self.uast + + def calculate_code_to_comment_ratio(self, root_node): + # Get the loc_snippet from the root node's metadata + loc_snippet = root_node.metadata.get("loc_snippet", 0) + + # Sum all loc_original_code for uast_comment nodes + total_comment_loc = 0 + + # Recursive function to sum comment LOC + def sum_comment_loc(node): + nonlocal total_comment_loc + + # Check if the node is a comment node + if node.node_type == "uast_comment": + total_comment_loc += node.metadata.get("loc_original_code", 0) + + # Traverse the children, ensuring we get the actual node objects + for child_id in node.children: + child_node = self.uast.get_node(child_id) # Fetch the actual child node using self.uast + sum_comment_loc(child_node) # Recursively sum for the child node + + # Start summing loc_original_code from the root node + sum_comment_loc(root_node) + + # Calculate the code-to-comment ratio (handling division by zero) + if total_comment_loc > 0: + return loc_snippet / total_comment_loc + else: + return None # Handle no comments + + def count_lo_comments(self, code_snippet): + lines = code_snippet.split('\n') + loc_count = 0 + for line in lines: + stripped_line = line.strip() + # Count all lines except blank ones + if stripped_line: + loc_count += 1 + return loc_count + + def count_loc(self, code_snippet, language): + # Define the comment markers for each language + language_comment_markers = { + "c": ('//', '/*', '*/'), + "java": ('//', '/*', '*/'), + "C#": ('//', 
'/*', '*/'), + "c_sharp": ('//', '/*', '*/'), + "cpp": ('//', '/*', '*/'), + "objc": ('//', '/*', '*/'), + "rust": ('//', '/*', '*/'), + "go": ('//', '/*', '*/'), + "kotlin": ('//', '/*', '*/'), + "VHDL": ('--', None, None), + "py": ('#', '"""', '"""'), + "js": ('//', '/*', '*/'), + "dart": ('//', '/*', '*/'), + "QML": ('//', None, None), + "typescript": ('//', '/*', '*/'), + "perl": ('#', None, None), + "haskell": ('--', '{-', '-}'), + "elm": ('--', '{-', '-}'), + "agda": ('--', '{-', '-}'), + "d": ('//', '/*', '*/'), + "nim": ('#', '##', None), + "ocaml": ('(*', '(*', '*)'), + "scala": ('//', '/*', '*/') + } + + single_line_comment, multi_line_comment_start, multi_line_comment_end = language_comment_markers.get(language, (None, None, None)) + + if not single_line_comment: + raise ValueError(f"Unsupported language: {language}") + + lines = code_snippet.split('\n') + loc_count = 0 + inside_multiline_comment = False + + for line in lines: + stripped_line = line.strip() + + # Skip empty lines + if not stripped_line: + continue + + # Handle multi-line comments + if multi_line_comment_start and multi_line_comment_end: + if inside_multiline_comment: + # Check if the line contains the end of a multi-line comment + if multi_line_comment_end in stripped_line: + inside_multiline_comment = False + continue + elif multi_line_comment_start in stripped_line: + # If the line starts a multi-line comment + inside_multiline_comment = True + continue + + # Skip single-line comments + if stripped_line.startswith(single_line_comment): + continue + + # If the line is neither a comment nor blank, count it as LOC + loc_count += 1 + + return loc_count + + def _add_user_defined(self, node): + id = node.id + type = node.node_type + + if node.code_snippet is not None: + snippet = node.code_snippet.replace(type, '').strip() + # Add further processing with the snippet + else: + # Handle the case where code_snippet is None + snippet = "" + # You can log a warning or take other appropriate action + print(f"Warning: node.code_snippet is None for node type: {type}") + + if (type in self.user_defined_entity): + exec(self.user_defined_entity[type]) + node.metadata["user_defined"] = True + + del id + del type + del snippet + return + + # Traversing through the AST to create nodes recursively. + def _dfs(self, AST_node, parent) : + if (AST_node.type in self.rules) : + ast_snippet = AST_node.text.decode("utf8") + node_type = self.rules[AST_node.type]["uast_node_type"] + exec_string = self.rules[AST_node.type]["extractor"] + uast_snippet = self._extract(ast_snippet = ast_snippet, node_type = node_type, exec_string = exec_string) + + if node_type == "uast_comment": + loc_original_code = self.count_lo_comments(ast_snippet) + else: + loc_original_code = self.count_loc(ast_snippet, self.language) + + node = self.uast.create_node( + node_type = node_type, + code_snippet = uast_snippet, + # choose to enable or disbale the storage of original code by removing the following line. 
+ metadata = { + "original_code" : ast_snippet, + "loc_original_code": loc_original_code + }, + ) + self._add_user_defined(node) + self.uast.add_edge(node1 = parent, node2 = node, directed_relation = "parent_node") + parent = node + + for child in AST_node.children: + self._dfs(AST_node= child, parent = parent) + + def _extract(self, ast_snippet, node_type, exec_string): + code_snippet = ast_snippet + try: + exec(exec_string) + except Exception as e: + print(e) + try: + return self.grammar[node_type]["keyword"] + " " + self.extracted + except Exception as e: + print(e) \ No newline at end of file diff --git a/transforms/code/higher_order_syntactic_profiler/python/src/hosp_local.py b/transforms/code/higher_order_syntactic_profiler/python/src/hosp_local.py new file mode 100644 index 000000000..4d66f5bd7 --- /dev/null +++ b/transforms/code/higher_order_syntactic_profiler/python/src/hosp_local.py @@ -0,0 +1,35 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_processing.data_access import DataAccessLocal +from hosp_transform import HigherOrderSyntacticProfilerTransform + + +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) + +hosp_params = {"metrics_list": ["CCR"]} + +if __name__ == "__main__": + # Here we show how to run outside of the runtime + # Create and configure the transform. + transform = HigherOrderSyntacticProfilerTransform(hosp_params) + # Use the local data access to read a parquet table. + data_access = DataAccessLocal() + table, _ = data_access.get_table(os.path.join(input_folder, "test.parquet")) + print(f"input table: {table}") + # Transform the table + table_list, metadata = transform.transform(table) + print(f"\noutput table: {table_list}") + print(f"output metadata : {metadata}") diff --git a/transforms/code/higher_order_syntactic_profiler/python/src/hosp_local_python.py b/transforms/code/higher_order_syntactic_profiler/python/src/hosp_local_python.py new file mode 100644 index 000000000..f6981e99b --- /dev/null +++ b/transforms/code/higher_order_syntactic_profiler/python/src/hosp_local_python.py @@ -0,0 +1,46 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import os +import sys + +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.utils import ParamsUtils +from hosp_transform_python import HigherOrderSyntacticProfilerPythonTransformConfiguration + + +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # Data access. Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + # execution info + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), + # hosp params + "hosp_metrics_list": ["CCR"] + +} +if __name__ == "__main__": + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + # create launcher + launcher = PythonTransformLauncher(runtime_config=HigherOrderSyntacticProfilerPythonTransformConfiguration()) + # Launch the ray actor(s) to process the input + launcher.launch() diff --git a/transforms/code/higher_order_syntactic_profiler/python/src/hosp_local_python_multiprocessor.py b/transforms/code/higher_order_syntactic_profiler/python/src/hosp_local_python_multiprocessor.py new file mode 100644 index 000000000..f5861ce13 --- /dev/null +++ b/transforms/code/higher_order_syntactic_profiler/python/src/hosp_local_python_multiprocessor.py @@ -0,0 +1,46 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import sys + +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.utils import ParamsUtils +from hosp_transform_python import HigherOrderSyntacticProfilerPythonTransformConfiguration + + +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # Data access. 
Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + # execution info + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), + # "runtime_num_processors": 2, + # hosp params + "hosp_metrics_list": ["CCR"] +} +if __name__ == "__main__": + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + # create launcher + launcher = PythonTransformLauncher(runtime_config=HigherOrderSyntacticProfilerPythonTransformConfiguration()) + # Launch the ray actor(s) to process the input + launcher.launch() diff --git a/transforms/code/higher_order_syntactic_profiler/python/src/hosp_transform.py b/transforms/code/higher_order_syntactic_profiler/python/src/hosp_transform.py new file mode 100644 index 000000000..aa5dad240 --- /dev/null +++ b/transforms/code/higher_order_syntactic_profiler/python/src/hosp_transform.py @@ -0,0 +1,167 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import time +from argparse import ArgumentParser, Namespace +from typing import Any +import csv +from pathlib import Path + +import pyarrow as pa +import pyarrow.parquet as pq +from data_processing.transform import AbstractTableTransform, TransformConfiguration +from data_processing.utils import CLIArgumentProvider +from UAST import * +from report import * + + +short_name = "hosp" +cli_prefix = f"{short_name}_" + +metrics_list = "metrics_list" +hosp_metrics_cli_param = f"{cli_prefix}{metrics_list}" + +base_constructs = ['Library', 'Language', 'Concepts'] + + +def uast_read(jsonstring): + """ + Reads an input json string into UAST class object + """ + uast = UAST() + if jsonstring is not None and jsonstring != 'null': + uast.load_from_json_string(jsonstring) + return uast + return None + +def extract_ccr(uast): + """ + Calculates the code to comment ratio given an UAST object as input + """ + if uast is not None: + total_comment_loc = 0 + for node_idx in uast.nodes: + node = uast.get_node(node_idx) + if node.node_type == 'uast_comment': + total_comment_loc += node.metadata.get("loc_original_code", 0) + elif node.node_type == 'uast_root': + loc_snippet = node.metadata.get("loc_snippet", 0) + if total_comment_loc > 0: + return loc_snippet / total_comment_loc + else: + return None + return None + +def generate_report(table: pa.Table, metrics_list): + """ + Generates the profiler report given the table name and the metrics list given as input by the user. 
+    """
+    columns = base_constructs + metrics_list
+    script_dir = Path(__file__).parent.resolve()
+    template_file = str(script_dir / 'template.html')
+    output_file = str(script_dir / 'output.html')
+    report = Report(template_file)
+    count = 0
+    for column in columns:
+        plot = Plot(table, column)
+        plot_html = plot.generate_distribution_plot()
+        report.add_metric(count, column, plot_html)
+        count += 1
+    report.save(output_file)
+
+
+
+class HigherOrderSyntacticProfilerTransform(AbstractTableTransform):
+    """
+    Profiles a pyarrow Table of code samples: computes the requested higher order
+    metrics (currently only CCR, the code-to-comment ratio) from the UAST column
+    and appends them as new columns.
+    """
+
+    def __init__(self, config: dict[str, Any]):
+        """
+        Initialize based on the dictionary of configuration information.
+        This is generally called with configuration parsed from the CLI arguments defined
+        by the companion runtime, HigherOrderSyntacticProfilerTransformRuntime. If running inside the RayMutatingDriver,
+        these will be provided by that class with help from the RayMutatingDriver.
+        """
+        # Make sure that the param name corresponds to the name used in the apply_input_params method
+        # of the HigherOrderSyntacticProfilerTransformConfiguration class. The CLI capture strips the
+        # "hosp_" prefix, so the configured value arrives under the "metrics_list" key.
+        super().__init__(config)
+        self.metrics_list = config.get(metrics_list, ["CCR"])
+
+
+    def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]:
+        """
+        Converts the input Table to 0 or more output tables. For each requested metric,
+        a new column is computed from the UAST column and appended; a profiling report
+        is generated when the required columns are present. Also returns a dictionary
+        of execution statistics - arbitrary dictionary.
+        """
+        self.logger.debug(f"Transforming one table with {len(table)} rows")
+        new_table = table  # fall back to the unmodified table if no metric applies
+        if self.metrics_list is not None:
+            for metric in self.metrics_list:
+                if metric == "CCR":
+                    self.logger.info(f"Generating {metric} values")
+                    uasts = [uast_read(uast_json) for uast_json in table['UAST'].to_pylist()]
+                    ccrs = [extract_ccr(uast) for uast in uasts]
+                    new_table = new_table.append_column(metric, pa.array(ccrs))
+        if 'UAST' in new_table.schema.names and 'Concepts' in new_table.schema.names:
+            generate_report(new_table, self.metrics_list)
+        self.logger.debug(f"Transformed one table with {len(new_table)} rows")
+        metadata = {"nfiles": 1, "nrows": len(new_table)}
+        return [new_table], metadata
+
+
+class HigherOrderSyntacticProfilerTransformConfiguration(TransformConfiguration):
+
+    """
+    Provides support for configuring and using the associated Transform class,
+    including configuration with CLI args.
+    """
+
+    def __init__(self):
+        super().__init__(
+            name=short_name,
+            transform_class=HigherOrderSyntacticProfilerTransform,
+            # remove_from_metadata=[pwd_key],
+        )
+        from data_processing.utils import get_logger
+
+        self.logger = get_logger(__name__)
+
+    def add_input_params(self, parser: ArgumentParser) -> None:
+        """
+        Add Transform-specific arguments to the given parser.
+        This will be included in a dictionary used to initialize the HigherOrderSyntacticProfilerTransform.
+        By convention a common prefix should be used for all transform-specific CLI args
+        (e.g., hosp_, pii_, etc.)
+        """
+
+        # Add argument for a list of strings
+        parser.add_argument(
+            f"--{hosp_metrics_cli_param}",
+            type=str,
+            nargs='+',  # Accept one or more strings
+            default=["CCR"],  # Set a default value as a list
+            help="List of higher order syntactic profiling metrics (default: ['CCR'])",
+        )
+
+
+    def apply_input_params(self, args: Namespace) -> bool:
+        """
+        Validate and apply the arguments that have been parsed
+        :param args: user defined arguments.
+ :return: True, if validate pass or False otherwise + """ + captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False) + self.params = self.params | captured + self.logger.info(f"hosp parameters are : {self.params}") + return True diff --git a/transforms/code/higher_order_syntactic_profiler/python/src/hosp_transform_python.py b/transforms/code/higher_order_syntactic_profiler/python/src/hosp_transform_python.py new file mode 100644 index 000000000..1c419765c --- /dev/null +++ b/transforms/code/higher_order_syntactic_profiler/python/src/hosp_transform_python.py @@ -0,0 +1,45 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import time + +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.runtime.pure_python.runtime_configuration import ( + PythonTransformRuntimeConfiguration, +) +from data_processing.utils import get_logger +from hosp_transform import HigherOrderSyntacticProfilerTransformConfiguration + + +logger = get_logger(__name__) + + +class HigherOrderSyntacticProfilerPythonTransformConfiguration(PythonTransformRuntimeConfiguration): + """ + Implements the PythonTransformConfiguration for HigherOrderSyntacticProfiler as required by the PythonTransformLauncher. + HigherOrderSyntacticProfiler does not use a RayRuntime class so the superclass only needs the base + python-only configuration. + """ + + def __init__(self): + """ + Initialization + :param base_configuration - base configuration class + """ + super().__init__(transform_config=HigherOrderSyntacticProfilerTransformConfiguration()) + + +if __name__ == "__main__": + # launcher = HigherOrderSyntacticProfilerRayLauncher() + launcher = PythonTransformLauncher(HigherOrderSyntacticProfilerPythonTransformConfiguration()) + logger.info("Launching hosp transform") + launcher.launch() diff --git a/transforms/code/higher_order_syntactic_profiler/python/src/output.html b/transforms/code/higher_order_syntactic_profiler/python/src/output.html new file mode 100644 index 000000000..630383180 --- /dev/null +++ b/transforms/code/higher_order_syntactic_profiler/python/src/output.html @@ -0,0 +1,169 @@ + + + + + + Profiler Report + + + + + + +
+    [output.html: the static markup of this generated page was stripped during extraction;
+     the recoverable text is the heading "Syntactic and Semantic Profile", the description
+     "This report presents the detailed profiling report of the input dataset.", and the
+     section title "Available Metrics". See template.html below for the page structure.]
+ + + + + \ No newline at end of file diff --git a/transforms/code/higher_order_syntactic_profiler/python/src/report.py b/transforms/code/higher_order_syntactic_profiler/python/src/report.py new file mode 100644 index 000000000..b950f9d0c --- /dev/null +++ b/transforms/code/higher_order_syntactic_profiler/python/src/report.py @@ -0,0 +1,101 @@ +import os +import numpy as np +from jinja2 import Environment, FileSystemLoader +import pyarrow as pa +import plotly.graph_objects as go +from plotly.io import to_html +from pathlib import Path + +class Plot: + ''' + Plot class implements the generation of frequency distribution plots of the various components of the profiler report. + Given a pyarrow table and a column name, it generates the corresponding plot. + ''' + def __init__(self, table, column_name): + self.table = table + self.column_name = column_name + self.column_data = self._get_column_data() + + def _get_column_data(self): + column_data = self.table[self.column_name].to_numpy() + split_data = [] + for value in column_data: + if isinstance(value, str) and ',' in value: + split_data.extend(value.split(',')) + else: + split_data.append(value) + return np.array([item.strip() if isinstance(item, str) else item for item in split_data]) + + def generate_distribution_plot(self): + data = self.column_data + fig = go.Figure() + fig.add_trace(go.Histogram(x=data, nbinsx=len(np.unique(data)), opacity=0.7, marker=dict(color='blue', line=dict(width=1, color='black')))) + fig.update_layout( + width=500, + height=300, + title=dict( + text=f'Distribution of {self.column_name}', + font=dict(size=14) + ), + xaxis=dict( + title='Value', + title_font=dict(size=12), + tickfont=dict(size=10) + ), + yaxis=dict( + title='Frequency', + title_font=dict(size=12), + tickfont=dict(size=10) + ), + bargap=0.1 + ) + return to_html(fig, full_html=False) + + +class Report: + ''' + Generates the report containing the distribution of various syntactic and semantic components. + ''' + def __init__(self, template_file: str): + path = Path(template_file) + directory = path.parent + file_name = path.name + self.env = Environment(loader=FileSystemLoader(directory)) + self.template = self.env.get_template(file_name) + self.data = {} + self.data['title'] = 'Profiler Report' + self.data['heading'] = 'Syntactic and Semantic Profile' + self.data['description'] = 'This report presents the detailed profiling report of the input dataset.' 
+
+    def add_metric(self, metric_id, name, graph_html=None):
+        if 'metrics' not in self.data:
+            self.data['metrics'] = []
+        self.data['metrics'].append({
+            'id': metric_id,
+            'name': name,
+            'graph_html': graph_html
+        })
+
+    def render(self):
+        return self.template.render(self.data)
+
+    def save(self, output_file):
+        output = self.render()
+        with open(output_file, 'w') as f:
+            f.write(output)
+        print(f"HTML file generated: {output_file}")
+
+
+
+
+# # Usage example
+# if __name__ == "__main__":
+#     data = {
+#         'column1': [1, 2, 2, 3, 3, 3, 4, 4, 4, 4]
+#     }
+#     table = pa.table(data)
+#     plot = Plot(table, 'column1')
+#     plot_html = plot.generate_distribution_plot()
+#     report = Report('template.html')
+#     report.add_metric('metric1', 'Metric 1', plot_html)
+#     report.save('output.html')
\ No newline at end of file
diff --git a/transforms/code/higher_order_syntactic_profiler/python/src/template.html b/transforms/code/higher_order_syntactic_profiler/python/src/template.html
new file mode 100644
index 000000000..0b94cd2ce
--- /dev/null
+++ b/transforms/code/higher_order_syntactic_profiler/python/src/template.html
@@ -0,0 +1,107 @@
+    [template.html body markup: a Jinja2 page titled {{ title }} that renders
+    {{ heading }}, {{ description }}, an "Available Metrics" navigation list, and a
+    {% for metric in metrics %} loop emitting each metric's name and its embedded
+    graph_html]
+ + + + + diff --git a/transforms/code/higher_order_syntactic_profiler/python/test-data/expected/metadata.json b/transforms/code/higher_order_syntactic_profiler/python/test-data/expected/metadata.json new file mode 100644 index 000000000..42bb81a07 --- /dev/null +++ b/transforms/code/higher_order_syntactic_profiler/python/test-data/expected/metadata.json @@ -0,0 +1,46 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "HigherOrderSyntacticProfiler", + "job type": "ray", + "job id": "job_id", + "start_time": "2024-03-01 15:17:56", + "end_time": "2024-03-01 15:17:57", + "status": "success" + }, + "code": [null], + "job_input_params": { + "sleep": 0, + "checkpointing": false, + "max_files": -1, + "number of workers": 1, + "worker options": { + "num_cpus": 0.8 + }, + "actor creation delay": 0 + }, + "execution_stats": { + "cpus": 10, + "gpus": 0, + "memory": 14.031964112073183, + "object_store": 2.0 + }, + "job_output_stats": { + "source_files": 1, + "source_size": 16534, + "result_files": 1, + "result_size": 16534, + "table_processing": 0.012392997741699219, + "nfiles": 1, + "nrows": 5 + }, + "source": { + "name": "test-data/data_processing/ray/hosp/input", + "type": "path" + }, + "target": { + "name": "/tmp/HOSP4o9gv2bq", + "type": "path" + } +} diff --git a/transforms/code/higher_order_syntactic_profiler/python/test-data/expected/test.parquet b/transforms/code/higher_order_syntactic_profiler/python/test-data/expected/test.parquet new file mode 100644 index 000000000..e0dda5bf6 Binary files /dev/null and b/transforms/code/higher_order_syntactic_profiler/python/test-data/expected/test.parquet differ diff --git a/transforms/code/higher_order_syntactic_profiler/python/test-data/input/test.parquet b/transforms/code/higher_order_syntactic_profiler/python/test-data/input/test.parquet new file mode 100644 index 000000000..7a4080d86 Binary files /dev/null and b/transforms/code/higher_order_syntactic_profiler/python/test-data/input/test.parquet differ diff --git a/transforms/code/higher_order_syntactic_profiler/python/test/test_hosp.py b/transforms/code/higher_order_syntactic_profiler/python/test/test_hosp.py new file mode 100644 index 000000000..9ef6dfa79 --- /dev/null +++ b/transforms/code/higher_order_syntactic_profiler/python/test/test_hosp.py @@ -0,0 +1,45 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ +import os + +import pyarrow as pa +from data_processing.test_support import get_tables_in_folder +from data_processing.test_support.transform.table_transform_test import ( + AbstractTableTransformTest, +) +from hosp_transform import HigherOrderSyntacticProfilerTransform, metrics_list + + +# table = pa.Table.from_pydict({"name": pa.array(["Tom"]), "age": pa.array([23])}) +# expected_table = table # We're a sp after all. 
+# expected_metadata_list = [{"nfiles": 1, "nrows": 1}, {}] # transform() result # flush() result + + +class TestHigherOrderSyntacticProfilerTransform(AbstractTableTransformTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + src_file_dir = os.path.abspath(os.path.dirname(__file__)) + input_dir = os.path.join(src_file_dir, "../test-data/input") + expected_dir = os.path.join(src_file_dir, "../test-data/expected") + input_tables = get_tables_in_folder(input_dir) + expected_tables = get_tables_in_folder(expected_dir) + + expected_metadata_list = [{"nfiles": 1, "nrows": len(expected_tables[0])}, {}] + config = {metrics_list: ["CCR"]} + fixtures = [ + (HigherOrderSyntacticProfilerTransform(config), input_tables, expected_tables, expected_metadata_list), + ] + return fixtures diff --git a/transforms/code/higher_order_syntactic_profiler/python/test/test_hosp_python.py b/transforms/code/higher_order_syntactic_profiler/python/test/test_hosp_python.py new file mode 100644 index 000000000..9554d5798 --- /dev/null +++ b/transforms/code/higher_order_syntactic_profiler/python/test/test_hosp_python.py @@ -0,0 +1,48 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from hosp_transform import hosp_metrics_cli_param +from hosp_transform_python import HigherOrderSyntacticProfilerPythonTransformConfiguration + + +class TestPythonSemanticProfilerTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + src_file_dir = os.path.abspath(os.path.dirname(__file__)) + fixtures = [] + + launcher = PythonTransformLauncher(HigherOrderSyntacticProfilerPythonTransformConfiguration()) + input_dir = os.path.join(src_file_dir, "../test-data/input") + expected_dir = os.path.join(src_file_dir, "../test-data/expected") + + transform_config = {hosp_metrics_cli_param: ["CCR"]} + fixtures.append( + ( + launcher, + transform_config, + input_dir, + expected_dir, + [], # optional list of column names to ignore in comparing test-generated with expected. 
+ ) + ) + + return fixtures diff --git a/transforms/code/higher_order_syntactic_profiler/ray/.dockerignore b/transforms/code/higher_order_syntactic_profiler/ray/.dockerignore new file mode 100644 index 000000000..f7275bbbd --- /dev/null +++ b/transforms/code/higher_order_syntactic_profiler/ray/.dockerignore @@ -0,0 +1 @@ +venv/ diff --git a/transforms/code/higher_order_syntactic_profiler/ray/.gitignore b/transforms/code/higher_order_syntactic_profiler/ray/.gitignore new file mode 100644 index 000000000..3ea7fd4ab --- /dev/null +++ b/transforms/code/higher_order_syntactic_profiler/ray/.gitignore @@ -0,0 +1,38 @@ +test-data/output +output/* +/output/ +data-processing-lib/ + + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + + +# Distribution / packaging +bin/ +build/ +develop-eggs/ +dist/ +eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +*.egg-info/ +.installed.cfg +*.egg + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +.tox/ +htmlcov +.coverage +.cache +nosetests.xml +coverage.xml \ No newline at end of file diff --git a/transforms/code/higher_order_syntactic_profiler/ray/Dockerfile b/transforms/code/higher_order_syntactic_profiler/ray/Dockerfile new file mode 100644 index 000000000..fcc3b3cbb --- /dev/null +++ b/transforms/code/higher_order_syntactic_profiler/ray/Dockerfile @@ -0,0 +1,42 @@ +ARG BASE_IMAGE=docker.io/rayproject/ray:2.24.0-py310 +FROM ${BASE_IMAGE} + +RUN pip install --upgrade --no-cache-dir pip + +# install pytest +RUN pip install --no-cache-dir pytest + +# Copy and install data processing libraries +# These are expected to be placed in the docker context before this is run (see the make image). +COPY --chown=ray:users data-processing-lib-python/ data-processing-lib-python/ +RUN cd data-processing-lib-python && pip install --no-cache-dir -e . +COPY --chown=ray:users data-processing-lib-ray/ data-processing-lib-ray/ +RUN cd data-processing-lib-ray && pip install --no-cache-dir -e . +COPY --chown=ray:users python-transform/ python-transform/ +RUN cd python-transform && pip install --no-cache-dir -e . + +#COPY requirements.txt requirements.txt +#RUN pip install --no-cache-dir -r requirements.txt + +COPY --chown=ray:users src/ src/ +COPY --chown=ray:users pyproject.toml pyproject.toml +RUN pip install --no-cache-dir -e . + +# copy the main() entry point to the image +COPY ./src/hosp_transform_ray.py . + +# copy some of the samples in +COPY ./src/hosp_local_ray.py local/ + +# copy test +COPY test/ test/ +COPY test-data/ test-data/ + +# Set environment +ENV PYTHONPATH /home/ray + +# Put these at the end since they seem to upset the docker cache. +ARG BUILD_DATE +ARG GIT_COMMIT +LABEL build-date=$BUILD_DATE +LABEL git-commit=$GIT_COMMIT diff --git a/transforms/code/higher_order_syntactic_profiler/ray/Makefile b/transforms/code/higher_order_syntactic_profiler/ray/Makefile new file mode 100644 index 000000000..4bc8cd28c --- /dev/null +++ b/transforms/code/higher_order_syntactic_profiler/ray/Makefile @@ -0,0 +1,58 @@ +# Define the root of the local git clone for the common rules to be able +# know where they are running from. +REPOROOT=../../../.. +# Include a library of common .transform.* targets which most +# transforms should be able to reuse. However, feel free +# to override/redefine the rules below. 
+
+include $(REPOROOT)/transforms/.make.transforms
+
+TRANSFORM_NAME=hosp
+
+BASE_IMAGE=${RAY_BASE_IMAGE}
+venv::	.transforms.ray-venv
+
+test::	.transforms.ray-test
+
+clean:: .transforms.clean
+
+image:: .transforms.ray-image
+
+test-src:: .transforms.test-src
+
+setup:: .transforms.setup
+
+test-image:: .transforms.ray-test-image
+
+build:: build-dist image
+
+publish: publish-image
+
+publish-image:: .transforms.publish-image-ray
+
+setup:: .transforms.setup
+
+# set the version of python transform that this depends on.
+set-versions:
+	$(MAKE) TRANSFORM_PYTHON_VERSION=${HOSP_PYTHON_VERSION} TOML_VERSION=$(HOSP_RAY_VERSION) .transforms.set-versions
+
+build-dist:: .defaults.build-dist
+
+publish-dist:: .defaults.publish-dist
+
+# Ensure RUN_ARGS has a default value
+RUN_ARGS ?= ""
+
+run-cli-sample: .transforms.run-cli-ray-sample
+
+run-local-sample: .transforms.run-local-ray-sample
+
+# run-s3-sample: .transforms.run-s3-ray-sample
+
+minio-start: .minio-start
+
+kind-load-image:: .transforms.kind-load-image
+
+docker-load-image: .defaults.docker-load-image
+
+docker-save-image: .defaults.docker-save-image
diff --git a/transforms/code/higher_order_syntactic_profiler/ray/README.md b/transforms/code/higher_order_syntactic_profiler/ray/README.md
new file mode 100644
index 000000000..038c830f9
--- /dev/null
+++ b/transforms/code/higher_order_syntactic_profiler/ray/README.md
@@ -0,0 +1,46 @@
+# HOSP Ray Transform
+Please see the set of
+[transform project conventions](../../../README.md#transform-project-conventions)
+for details on general project conventions, transform configuration,
+testing and IDE set up.
+
+## Summary
+This project wraps the [hosp transform](../python) with a Ray runtime.
+
+## Configuration and command line Options
+
+HOSP configuration and command line options are the same as for the [base python](../python) transform.
+
+## Running
+
+### Launched Command Line Options
+In addition to those available to the transform as defined [here](../python/README.md),
+the set of
+[ray launcher](../../../../data-processing-lib/doc/ray-launcher-options.md) options are available.
+
+### Running the samples
+To run the samples, use the following `make` targets:
+
+* `run-cli-sample` - runs src/hosp_transform_ray.py using command line args
+* `run-local-sample` - runs src/hosp_local_ray.py
+
+
+These targets will activate the virtual environment and set up any configuration needed.
+Use the `-n` option of `make` to see the detail of what is done to run the sample.
+
+For example,
+```shell
+make run-cli-sample
+...
+```
+Then
+```shell
+ls output
+```
+to see the results of the transform.
+
+### Transforming data using the transform image
+
+To use the transform image to transform your data, please refer to the
+[running images quickstart](../../../../doc/quick-start/run-transform-image.md),
+substituting the name of this transform image and runtime as appropriate.
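+
+### Launching programmatically
+
+For reference, here is a minimal sketch of driving the Ray runtime from Python rather
+than through `make`; it condenses what src/hosp_local_ray.py does, and the input and
+output folders are illustrative:
+
+```python
+import sys
+
+from data_processing.utils import ParamsUtils
+from data_processing_ray.runtime.ray import RayTransformLauncher
+from hosp_transform_ray import HigherOrderSyntacticProfilerRayTransformConfiguration
+
+# Simulate the command line that the launcher parses (folders are illustrative).
+params = {
+    "run_locally": True,
+    "data_local_config": ParamsUtils.convert_to_ast(
+        {"input_folder": "test-data/input", "output_folder": "output"}
+    ),
+    "hosp_metrics_list": ["CCR"],
+}
+sys.argv = ParamsUtils.dict_to_req(d=params)
+
+# Launch the Ray actor(s) to process the input.
+launcher = RayTransformLauncher(HigherOrderSyntacticProfilerRayTransformConfiguration())
+launcher.launch()
+```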
diff --git a/transforms/code/higher_order_syntactic_profiler/ray/pyproject.toml b/transforms/code/higher_order_syntactic_profiler/ray/pyproject.toml new file mode 100644 index 000000000..96a5bb28a --- /dev/null +++ b/transforms/code/higher_order_syntactic_profiler/ray/pyproject.toml @@ -0,0 +1,45 @@ +[project] +name = "dpk_hosp_transform_ray" +version = "0.2.1.dev0" +requires-python = ">=3.10" +description = "HOSP Ray Transform" +license = {text = "Apache-2.0"} +readme = {file = "README.md", content-type = "text/markdown"} +authors = [ + { name = "Aishwariya Chakraborty", email = "aishwariya.chakraborty1@ibm.com" }, +] +dependencies = [ + "dpk-hosp-transform-python==0.2.1.dev0", + "data-prep-toolkit-ray==0.2.1.dev0", +] + +[build-system] +requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] +build-backend = "setuptools.build_meta" + +[project.optional-dependencies] +dev = [ + "twine", + "pytest>=7.3.2", + "pytest-dotenv>=0.5.2", + "pytest-env>=1.0.0", + "pre-commit>=3.3.2", + "pytest-cov>=4.1.0", + "pytest-mock>=3.10.0", + "moto==5.0.5", + "markupsafe==2.0.1", +] + +[options] +package_dir = ["src","test"] + +[options.packages.find] +where = ["src/"] + +[tool.pytest.ini_options] +# Currently we use low coverage since we have to run tests separately (see makefile) +#addopts = "--cov --cov-report term-missing --cov-fail-under 25" +markers = ["unit: unit tests", "integration: integration tests"] + +[tool.coverage.run] +include = ["src/*"] diff --git a/transforms/code/higher_order_syntactic_profiler/ray/src/hosp_local_ray.py b/transforms/code/higher_order_syntactic_profiler/ray/src/hosp_local_ray.py new file mode 100644 index 000000000..d639aa0fc --- /dev/null +++ b/transforms/code/higher_order_syntactic_profiler/ray/src/hosp_local_ray.py @@ -0,0 +1,51 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import sys + +from data_processing.utils import ParamsUtils +from data_processing_ray.runtime.ray import RayTransformLauncher +from hosp_transform_ray import HigherOrderSyntacticProfilerRayTransformConfiguration + + +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} +worker_options = {"num_cpus": 0.8} +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # where to run + "run_locally": True, + # Data access. 
Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + # orchestrator + "runtime_worker_options": ParamsUtils.convert_to_ast(worker_options), + "runtime_num_workers": 3, + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_creation_delay": 0, + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), + # hosp params + "hosp_metrics_list": ["CCR"] +} +if __name__ == "__main__": + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + # create launcher + launcher = RayTransformLauncher(HigherOrderSyntacticProfilerRayTransformConfiguration()) + # Launch the ray actor(s) to process the input + launcher.launch() diff --git a/transforms/code/higher_order_syntactic_profiler/ray/src/hosp_transform_ray.py b/transforms/code/higher_order_syntactic_profiler/ray/src/hosp_transform_ray.py new file mode 100644 index 000000000..fabf28e8b --- /dev/null +++ b/transforms/code/higher_order_syntactic_profiler/ray/src/hosp_transform_ray.py @@ -0,0 +1,43 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +from data_processing.utils import CLIArgumentProvider, get_logger +from data_processing_ray.runtime.ray import RayTransformLauncher +from data_processing_ray.runtime.ray.runtime_configuration import ( + RayTransformRuntimeConfiguration, +) +from hosp_transform import HigherOrderSyntacticProfilerTransformConfiguration + + +logger = get_logger(__name__) + + +class HigherOrderSyntacticProfilerRayTransformConfiguration(RayTransformRuntimeConfiguration): + """ + Implements the RayTransformConfiguration for HOSP as required by the RayTransformLauncher. + HOSP does not use a RayRuntime class so the superclass only needs the base + python-only configuration. 
+ """ + + def __init__(self): + """ + Initialization + :param base_configuration - base configuration class + """ + super().__init__(transform_config=HigherOrderSyntacticProfilerTransformConfiguration()) + + +if __name__ == "__main__": + # launcher = HOSPRayLauncher() + launcher = RayTransformLauncher(HigherOrderSyntacticProfilerRayTransformConfiguration()) + logger.info("Launching hosp transform") + launcher.launch() diff --git a/transforms/code/higher_order_syntactic_profiler/ray/test-data/expected/metadata.json b/transforms/code/higher_order_syntactic_profiler/ray/test-data/expected/metadata.json new file mode 100644 index 000000000..42bb81a07 --- /dev/null +++ b/transforms/code/higher_order_syntactic_profiler/ray/test-data/expected/metadata.json @@ -0,0 +1,46 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "HigherOrderSyntacticProfiler", + "job type": "ray", + "job id": "job_id", + "start_time": "2024-03-01 15:17:56", + "end_time": "2024-03-01 15:17:57", + "status": "success" + }, + "code": [null], + "job_input_params": { + "sleep": 0, + "checkpointing": false, + "max_files": -1, + "number of workers": 1, + "worker options": { + "num_cpus": 0.8 + }, + "actor creation delay": 0 + }, + "execution_stats": { + "cpus": 10, + "gpus": 0, + "memory": 14.031964112073183, + "object_store": 2.0 + }, + "job_output_stats": { + "source_files": 1, + "source_size": 16534, + "result_files": 1, + "result_size": 16534, + "table_processing": 0.012392997741699219, + "nfiles": 1, + "nrows": 5 + }, + "source": { + "name": "test-data/data_processing/ray/hosp/input", + "type": "path" + }, + "target": { + "name": "/tmp/HOSP4o9gv2bq", + "type": "path" + } +} diff --git a/transforms/code/higher_order_syntactic_profiler/ray/test-data/expected/test.parquet b/transforms/code/higher_order_syntactic_profiler/ray/test-data/expected/test.parquet new file mode 100644 index 000000000..8f89f008a Binary files /dev/null and b/transforms/code/higher_order_syntactic_profiler/ray/test-data/expected/test.parquet differ diff --git a/transforms/code/higher_order_syntactic_profiler/ray/test-data/input/test.parquet b/transforms/code/higher_order_syntactic_profiler/ray/test-data/input/test.parquet new file mode 100644 index 000000000..f9ac1f024 Binary files /dev/null and b/transforms/code/higher_order_syntactic_profiler/ray/test-data/input/test.parquet differ diff --git a/transforms/code/higher_order_syntactic_profiler/ray/test/test_hosp_ray.py b/transforms/code/higher_order_syntactic_profiler/ray/test/test_hosp_ray.py new file mode 100644 index 000000000..16defe6f3 --- /dev/null +++ b/transforms/code/higher_order_syntactic_profiler/ray/test/test_hosp_ray.py @@ -0,0 +1,47 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import os + +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing_ray.runtime.ray import RayTransformLauncher +from hosp_transform import hosp_metrics_cli_param +from hosp_transform_ray import HigherOrderSyntacticProfilerRayTransformConfiguration + + +class TestRayHigherOrderSyntacticProfilerTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + src_file_dir = os.path.abspath(os.path.dirname(__file__)) + fixtures = [] + + launcher = RayTransformLauncher(HigherOrderSyntacticProfilerRayTransformConfiguration()) + input_dir = os.path.join(src_file_dir, "../test-data/input") + expected_dir = os.path.join(src_file_dir, "../test-data/expected") + runtime_config = {"run_locally": True} + transform_config = {hosp_metrics_cli_param: ["CCR"]} + fixtures.append( + ( + launcher, + transform_config | runtime_config, + input_dir, + expected_dir, + [], # optional list of column names to ignore in comparing test-generated with expected. + ) + ) + return fixtures diff --git a/transforms/code/semantic_profiler/Makefile b/transforms/code/semantic_profiler/Makefile new file mode 100644 index 000000000..a98281e4d --- /dev/null +++ b/transforms/code/semantic_profiler/Makefile @@ -0,0 +1,78 @@ +REPOROOT=../../.. +# Use make help, to see the available rules +include $(REPOROOT)/.make.defaults + +setup:: + @# Help: Recursively make $@ all subdirs + $(MAKE) RULE=$@ .recurse + +clean:: + @# Help: Recursively make $@ all subdirs + $(MAKE) RULE=$@ .recurse + +build:: + @# Help: Recursively make $@ in subdirs + $(MAKE) RULE=$@ .recurse +venv:: + @# Help: Recursively make $@ in subdirs + $(MAKE) RULE=$@ .recurse + +image:: + @# Help: Recursively make $@ in all subdirs + @$(MAKE) RULE=$@ .recurse + +set-versions: + @# Help: Recursively $@ in all subdirs + @$(MAKE) RULE=$@ .recurse + +publish:: + @# Help: Recursively make $@ in all subdirs + @$(MAKE) RULE=$@ .recurse + +test-image:: + @# Help: Recursively make $@ in all subdirs + @$(MAKE) RULE=$@ .recurse + +test:: + @# Help: Recursively make $@ in all subdirs + @$(MAKE) RULE=$@ .recurse + +test-src:: + @# Help: Recursively make $@ in all subdirs + $(MAKE) RULE=$@ .recurse + +kind-load-image:: + @# Help: Recursively make $@ in all subdirs + $(MAKE) RULE=$@ .recurse + +docker-load-image:: + @# Help: Recursively make $@ in all subdirs + $(MAKE) RULE=$@ .recurse + +docker-save-image:: + @# Help: Recursively make $@ in all subdirs + $(MAKE) RULE=$@ .recurse + +.PHONY: workflow-venv +workflow-venv: + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi + +.PHONY: workflow-test +workflow-test: + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-test; \ + fi + +.PHONY: workflow-upload +workflow-upload: + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi + +.PHONY: workflow-build +workflow-build: + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi diff --git a/transforms/code/semantic_profiler/README.md b/transforms/code/semantic_profiler/README.md new file mode 100644 index 000000000..9090a9b20 --- /dev/null +++ b/transforms/code/semantic_profiler/README.md @@ -0,0 +1,12 @@ +# SP Transform +The SP transform performs the semantic 
profiling of code snippets in a dataset. This
+is done based on the libraries and their categorization obtained from an Internal
+Knowledge Base (IKB) which is generated offline using LLMs. Per the set of
+[transform project conventions](../../README.md#transform-project-conventions)
+the following runtimes are available:
+
+* [python](python/README.md) - provides the base python-based transformation
+implementation.
+* [ray](ray/README.md) - enables the running of the base python transformation
+in a Ray runtime.
+
diff --git a/transforms/code/semantic_profiler/python/.dockerignore b/transforms/code/semantic_profiler/python/.dockerignore
new file mode 100644
index 000000000..f7275bbbd
--- /dev/null
+++ b/transforms/code/semantic_profiler/python/.dockerignore
@@ -0,0 +1 @@
+venv/
diff --git a/transforms/code/semantic_profiler/python/Dockerfile b/transforms/code/semantic_profiler/python/Dockerfile
new file mode 100644
index 000000000..65760f938
--- /dev/null
+++ b/transforms/code/semantic_profiler/python/Dockerfile
@@ -0,0 +1,41 @@
+FROM docker.io/python:3.10.14-slim-bullseye
+
+RUN pip install --upgrade --no-cache-dir pip
+
+# install pytest
+RUN pip install --no-cache-dir pytest
+
+# Create a user and use it to run the transform
+RUN useradd -ms /bin/bash dpk
+USER dpk
+WORKDIR /home/dpk
+
+# Copy and install data processing libraries
+# These are expected to be placed in the docker context before this is run (see the make image).
+COPY --chown=dpk:root data-processing-lib-python/ data-processing-lib-python/
+RUN cd data-processing-lib-python && pip install --no-cache-dir -e .
+
+# END OF STEPS destined for a data-prep-kit base image
+
+COPY --chown=dpk:root src/ src/
+COPY --chown=dpk:root pyproject.toml pyproject.toml
+RUN pip install --no-cache-dir -e .
+
+# copy transform main() entry point to the image
+COPY ./src/sp_transform_python.py .
+
+# copy some of the samples in
+COPY ./src/sp_local.py local/
+
+# copy test
+COPY test/ test/
+COPY test-data/ test-data/
+
+# Set environment
+ENV PYTHONPATH /home/dpk
+
+# Put these at the end since they seem to upset the docker cache.
+ARG BUILD_DATE
+ARG GIT_COMMIT
+LABEL build-date=$BUILD_DATE
+LABEL git-commit=$GIT_COMMIT
diff --git a/transforms/code/semantic_profiler/python/Makefile b/transforms/code/semantic_profiler/python/Makefile
new file mode 100644
index 000000000..ea01570e5
--- /dev/null
+++ b/transforms/code/semantic_profiler/python/Makefile
@@ -0,0 +1,66 @@
+# Define the root of the local git clone for the common rules to be able
+# know where they are running from.
+REPOROOT=../../../..
+# Include a library of common .transform.* targets which most
+# transforms should be able to reuse. However, feel free
+# to override/redefine the rules below.
+
+# $(REPOROOT)/.make.versions file contains the versions
+include $(REPOROOT)/transforms/.make.transforms
+
+TRANSFORM_NAME=sp
+
+
+venv::	.transforms.python-venv
+
+test::	.transforms.python-test
+
+clean:: .transforms.clean
+
+image:: .transforms.python-image
+
+test-src:: .transforms.test-src
+
+setup:: .transforms.setup
+
+build:: build-dist image
+
+publish: publish-image
+
+publish-image:: .transforms.publish-image-python
+
+setup:: .transforms.setup
+
+# distribution versions is the same as image version.
+set-versions:
+	$(MAKE) TRANSFORM_PYTHON_VERSION=$(SP_PYTHON_VERSION) TOML_VERSION=$(SP_PYTHON_VERSION) .transforms.set-versions
+
+build-dist:: .defaults.build-dist
+
+publish-dist:: .defaults.publish-dist
+
+test-image:: .transforms.python-test-image
+
+# Ensure RUN_ARGS has a default value
+RUN_ARGS ?= ""
+
+run-cli-sample: .transforms.run-cli-python-sample
+
+run-local-sample: .transforms.run-local-sample
+
+run-local-python-sample: .transforms.run-local-python-sample
+
+# run-local-python-sample:
+#	$(MAKE) RUN_FILE=sp_local_python.py \
+#		.transforms.run-local-python-sample
+
+# RUN_ARGS="--sp_ikb 'Contents' --language 'Language'" \
+#run-s3-ray-sample: .transforms.run-s3-ray-sample
+
+minio-start: .minio-start
+
+kind-load-image:: .transforms.kind-load-image
+
+docker-load-image: .defaults.docker-load-image
+
+docker-save-image: .defaults.docker-save-image
diff --git a/transforms/code/semantic_profiler/python/README.md b/transforms/code/semantic_profiler/python/README.md
new file mode 100644
index 000000000..5d9bd3c7d
--- /dev/null
+++ b/transforms/code/semantic_profiler/python/README.md
@@ -0,0 +1,73 @@
+# SP Transform
+Please see the set of
+[transform project conventions](../../../README.md#transform-project-conventions)
+for details on general project conventions, transform configuration,
+testing and IDE set up.
+
+## Summary
+This transform implements semantic profiling of a code dataset. Given an input dataset
+as a pyarrow table with the UBSRs of code data points, this transform extracts the libraries
+and obtains their semantic mapping by consulting the IKB. The semantic concepts obtained per data
+point are then added as a new column in the input dataset. Those libraries which are not present in the
+IKB are recorded in a separate 'null_libs' file for offline processing. This file is passed as an input
+to the [offline path](src/offline_path/) which reads the libraries and obtains their semantic categories
+from a predefined set by prompting an LLM. The examples passed into the prompt are present in the [examples folder](src/examples/).
+
+## Configuration and command line Options
+
+The set of dictionary keys holding [SPTransform](src/sp_transform.py)
+configuration for values are as follows:
+
+* _sp_ikb_file_ - This is the path to the IKB file which is a CSV file and is by default located in the [IKB](src/ikb/) folder.
+  It contains three columns - Library, Language, Category. The set of categories is defined in the
+  [concept map file](src/concept_map/).
+* _sp_null_libs_file_ - This is the path to the null_libs file which is also a CSV file containing two columns -
+  Library, Language. Its default value is src/ikb/null_libs.csv.
+
+## Running
+
+### Launched Command Line Options
+The following command line arguments are available in addition to
+the options provided by
+the [python launcher](../../../../data-processing-lib/doc/python-launcher-options.md).
+```
+  --sp_ikb_file SP_IKB_FILE
+                        Path to the IKB file
+  --sp_null_libs_file SP_NULL_LIBS_FILE
+                        Path to the file to store the libraries for which no match could be found in the IKB
+```
+
+| Parameter | Default | Description |
+|------------|----------|--------------|
+| `SP_IKB_FILE` | `ikb/ikb_model.csv` | Path to IKB file. |
+| `SP_NULL_LIBS_FILE` | `ikb/null_libs.csv` | Path to file in which libraries with no matching entries in IKB are recorded. |
+
+These correspond to the configuration keys described above.
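+
+For illustration, here is a minimal sketch of a complete command line that combines
+these keys with the python launcher's local data-access option (the folder paths are
+hypothetical):
+
+```shell
+python src/sp_transform_python.py \
+    --data_local_config "{'input_folder': 'test-data/input', 'output_folder': 'output'}" \
+    --sp_ikb_file src/ikb/ikb_model.csv \
+    --sp_null_libs_file src/ikb/null_libs.csv
+```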
+
+### Running the samples
+To run the samples, use the following `make` targets:
+
+* `run-cli-sample` - runs src/sp_transform_python.py using command line args
+* `run-local-sample` - runs src/sp_local.py
+
+These targets will activate the virtual environment and set up any configuration needed.
+Use the `-n` option of `make` to see the detail of what is done to run the sample.
+
+For example,
+```shell
+make run-cli-sample
+...
+```
+Then
+
+```shell
+ls output
+```
+to see the results of the transform.
+
+### Transforming data using the transform image
+
+To use the transform image to transform your data, please refer to the
+[running images quickstart](../../../../doc/quick-start/run-transform-image.md),
+substituting the name of this transform image and runtime as appropriate.
+
diff --git a/transforms/code/semantic_profiler/python/pyproject.toml b/transforms/code/semantic_profiler/python/pyproject.toml
new file mode 100644
index 000000000..01ea445f1
--- /dev/null
+++ b/transforms/code/semantic_profiler/python/pyproject.toml
@@ -0,0 +1,50 @@
+[project]
+name = "dpk_sp_transform_python"
+version = "0.2.1.dev0"
+requires-python = ">=3.10"
+description = "SemanticProfiler Python Transform"
+license = {text = "Apache-2.0"}
+readme = {file = "README.md", content-type = "text/markdown"}
+authors = [
+    { name = "Aishwariya Chakraborty", email = "aishwariya.chakraborty1@ibm.com" },
+]
+dependencies = [
+    "data-prep-toolkit==0.2.1.dev0",
+]
+
+[build-system]
+requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"]
+build-backend = "setuptools.build_meta"
+
+[project.optional-dependencies]
+dev = [
+    "twine",
+    "pytest>=7.3.2",
+    "pytest-dotenv>=0.5.2",
+    "pytest-env>=1.0.0",
+    "pre-commit>=3.3.2",
+    "pytest-cov>=4.1.0",
+    "pytest-mock>=3.10.0",
+    "moto==5.0.5",
+    "markupsafe==2.0.1",
+]
+
+[options]
+package_dir = ["src","test"]
+
+[options.packages.find]
+where = ["src/"]
+
+[tool.pytest.ini_options]
+# Currently we use low coverage since we have to run tests separately (see makefile)
+#addopts = "--cov --cov-report term-missing --cov-fail-under 25"
+markers = ["unit: unit tests", "integration: integration tests"]
+
+[tool.coverage.run]
+include = ["src/*"]
+
+[tool.setuptools.packages.find]
+where = ["src"]
+
+[tool.setuptools.package-data]
+'src' = ['*.csv']
\ No newline at end of file
diff --git a/transforms/code/semantic_profiler/python/src/concept_map/updated_concept_list.csv b/transforms/code/semantic_profiler/python/src/concept_map/updated_concept_list.csv
new file mode 100644
index 000000000..685d62d3d
--- /dev/null
+++ b/transforms/code/semantic_profiler/python/src/concept_map/updated_concept_list.csv
@@ -0,0 +1,14 @@
+Category
+Algorithms and Data Structures
+Database Management
+File Handling
+Networking and Messaging
+Graphical User Interface Design
+Security
+Scheduling and Concurrency
+Logging and Monitoring
+Web Development
+Mathematics and Numerics
+Code Analysis and Linting
+Testing
+Data Serialization
\ No newline at end of file
diff --git a/transforms/code/semantic_profiler/python/src/examples/examples-i.csv b/transforms/code/semantic_profiler/python/src/examples/examples-i.csv
new file mode 100644
index 000000000..639735b66
--- /dev/null
+++ b/transforms/code/semantic_profiler/python/src/examples/examples-i.csv
@@ -0,0 +1,27 @@
+Library,Language
+algorithms,Python
+asyncio,Python
+arrow,Python
+authlib,Python
+webassets,Python
+scipy,Python
+pymysql,Python
+mimetypes,Python
+logging,Python
+flake8,Python
+mamba,Python
+marshmallow,Python
+tkinter,Python
+com.leansoft.bigqueue,Java +com.cisco.commons.networking,Java +net.time4j,Java +org.apache.shiro,Java +java.net.http,Java +org.apache.commons.math4,Java +ch.vorburger.mariaDB4j,Java +com.google.jimfs,Java +java.logging,Java +org.sonar,Java +org.junit,Java +com.cedarsoftware:json-io,Java +java.desktop,Java diff --git a/transforms/code/semantic_profiler/python/src/examples/examples-o.csv b/transforms/code/semantic_profiler/python/src/examples/examples-o.csv new file mode 100644 index 000000000..b7eb9397a --- /dev/null +++ b/transforms/code/semantic_profiler/python/src/examples/examples-o.csv @@ -0,0 +1,27 @@ +Library,Language,Category +algorithms,Python,Algorithms and Data Structures +asyncio,Python,Networking and Messaging +arrow,Python,Scheduling and Concurrency +authlib,Python,Security +webassets,Python,Web Development +scipy,Python,Mathematics and Numerics +pymysql,Python,Database Management +mimetypes,Python,File Handling +logging,Python,Logging and Monitoring +flake8,Python,Code Analysis and Linting +mamba,Python,Testing +marshmallow,Python,Data Serialization +tkinter,Python,Graphical User Interface Design +com.leansoft.bigqueue,Java,Algorithms and Data Structures +com.cisco.commons.networking,Java,Networking and Messaging +net.time4j,Java,Scheduling and Concurrency +org.apache.shiro,Java,Security +java.net.http,Java,Web Development +org.apache.commons.math4,Java,Mathematics and Numerics +ch.vorburger.mariaDB4j,Java,Database Management +com.google.jimfs,Java,File Handling +java.logging,Java,Logging and Monitoring +org.sonar,Java,Code Analysis and Linting +org.junit,Java,Testing +com.cedarsoftware:json-io,Java,Data Serialization +java.desktop,Java,Graphical User Interface Design \ No newline at end of file diff --git a/transforms/code/semantic_profiler/python/src/ikb/ikb_model.csv b/transforms/code/semantic_profiler/python/src/ikb/ikb_model.csv new file mode 100644 index 000000000..bda9d2a66 --- /dev/null +++ b/transforms/code/semantic_profiler/python/src/ikb/ikb_model.csv @@ -0,0 +1,1021 @@ +Library,Language,Category +dynamic_bitset,Cpp,Algorithms and Data Structures +tries,Cpp,Algorithms and Data Structures +algorithm,Cpp,Algorithms and Data Structures +uni-algo,Cpp,Algorithms and Data Structures +boost.asio,Cpp,Networking and Messaging +cpp-netlib,Cpp,Networking and Messaging +zmq,Cpp,Networking and Messaging +azmq,Cpp,Networking and Messaging +thread-pool,Cpp,Scheduling and Concurrency +chrono,Cpp,Scheduling and Concurrency +concurrencpp,Cpp,Scheduling and Concurrency +time,Cpp,Scheduling and Concurrency +libressl,Cpp,Security +libgcrypt,Cpp,Security +nettle,Cpp,Security +digestpp,Cpp,Security +libonion,Cpp,Web Development +cpp-httplib,Cpp,Web Development +jwt-cpp,Cpp,Security +libfv,Cpp,Mathematics and Numerics +blaze,Cpp,Mathematics and Numerics +cnl,Cpp,Mathematics and Numerics +eigen,Cpp,Mathematics and Numerics +linalg,Cpp,Mathematics and Numerics +clickhouse,Cpp,Database Management +leveldb,Cpp,Database Management +libpqxx,Cpp,Database Management +sqlite,Cpp,Database Management +filesystem,Cpp,File Handling +llfio,Cpp,File Handling +glob,Cpp,File Handling +tinydir,Cpp,File Handling +spdlog,Cpp,Logging and Monitoring +boost.log,Cpp,Logging and Monitoring +glog,Cpp,Logging and Monitoring +reckless,Cpp,Algorithms and Data Structures +clang-tidy,Cpp,Code Analysis and Linting +clangd,Cpp,Code Analysis and Linting +cquery,Cpp,Code Analysis and Linting +cppcheck,Cpp,Code Analysis and Linting +boost.test,Cpp,Testing +benchmark,Cpp,Testing +cpputest,Cpp,Testing +ctest,Cpp,Testing 
+dlib,Cpp,Algorithms and Data Structures +blitz,Cpp,Algorithms and Data Structures +armadillo,Cpp,Algorithms and Data Structures +oneapi/dal,Cpp,Database Management +frozen,Cpp,Data Serialization +glaze,Cpp,Data Serialization +cppcodec,Cpp,Data Serialization +boost.serialization,Cpp,Data Serialization +infra,Cpp,Networking and Messaging +workflow,Cpp,Scheduling and Concurrency +taskflow,Cpp,Scheduling and Concurrency +libthrift,Cpp,Networking and Messaging +cegui,Cpp,Graphical User Interface Design +wxwidgets,Cpp,Graphical User Interface Design +gtk,Cpp,Graphical User Interface Design +nanogui,Cpp,Graphical User Interface Design +com.leansoft.bigqueue,Java,Algorithms and Data Structures +com.liveramp.hyperminhash,Java,Algorithms and Data Structures +org.pcollections,Java,Algorithms and Data Structures +org.ojalgo,Java,Algorithms and Data Structures +com.cisco.commons.networking,Java,Networking and Messaging +io.netty,Java,Networking and Messaging +org.apache.kafka,Java,Networking and Messaging +com.rabbitmq,Java,Networking and Messaging +net.time4j,Java,Scheduling and Concurrency +org.jobrunr:jobrunr,Java,Scheduling and Concurrency +org.quartz,Java,Scheduling and Concurrency +org.knowm.sundial,Java,Scheduling and Concurrency +org.apache.shiro,Java,Security +org.bouncycastle,Java,Security +jdk.crypto.cryptoki,Java,Security +jdk.security,Java,Security +java.net.http,Java,Web Development +jdk.httpserver,Java,Web Development +io.activej.codegen,Java,Code Analysis and Linting +ninja,Java,Code Analysis and Linting +org.apache.commons.math4,Java,Mathematics and Numerics +org.apache.commons.numbers,Java,Mathematics and Numerics +org.apache.commons.rng,Java,Mathematics and Numerics +com.mathLibrary,Java,Mathematics and Numerics +ch.vorburger.mariaDB4j,Java,Database Management +java.sql,Java,Database Management +redis.clients.jedis,Java,Database Management +org.jooq,Java,Database Management +com.google.jimfs,Java,File Handling +java.io,Java,File Handling +java.nio.file,Java,File Handling +org.apache.commons.vfs2,Java,File Handling +java.logging,Java,Logging and Monitoring +jdk.jconsole,Java,IT Automation +java.util.logging,Java,Logging and Monitoring +org.slf4j.Logger,Java,Logging and Monitoring +org.sonar,Java,Code Analysis and Linting +fr.inria.gforge.spoon,Java,Code Analysis and Linting +com.puppycrawl.tools.checkstyle,Java,Code Analysis and Linting +net.sourceforge.pmd,Java,Code Analysis and Linting +org.junit,Java,Testing +com.intuit.karate,Java,Testing +org.mockito,Java,Testing +org.apache.jmeter,Java,Testing +org.influxdb,Java,Data Analysis +org.apache.spark,Java,Data Analysis +org.apache.flink,Java,Data Analysis +weka,Java,Data Analysis +com.cedarsoftware:json-io,Java,Data Serialization +com.google.flatbuffers,Java,Data Serialization +org.msgpack,Java,Data Serialization +com.esotericsoftware.kryo,Java,Data Serialization +jenkins.model.Jenkins,Java,IT Automation +org.apache.maven,Java,IT Automation +org.gradle,Java,IT Automation +com.microsoft.terraform,Java,IT Automation +java.desktop,Java,Graphical User Interface Design +java.awt,Java,Graphical User Interface Design +org.openjfx,Java,Graphical User Interface Design +org.eclipse.swt,Java,Graphical User Interface Design +ngraph.graph,JavaScript,Algorithms and Data Structures +buckets,JavaScript,Algorithms and Data Structures +mori,JavaScript,Algorithms and Data Structures +graphlib,JavaScript,Algorithms and Data Structures +socket.io,JavaScript,Networking and Messaging +request,JavaScript,Web Development +amqplib,JavaScript,Networking and 
Messaging +mqtt,JavaScript,Networking and Messaging +fullcalendar,JavaScript,Graphical User Interface Design +later,JavaScript,Scheduling and Concurrency +date-fns,JavaScript,Mathematics and Numerics +Moment,JavaScript,Mathematics and Numerics +helmet,JavaScript,Security +bcrypt,JavaScript,Security +js-xss,JavaScript,Security +xss-filters,JavaScript,Security +vue,JavaScript,Graphical User Interface Design +react,JavaScript,Graphical User Interface Design +express,JavaScript,Web Development +angular,JavaScript,Graphical User Interface Design +Polynomial,JavaScript,Mathematics and Numerics +Numeral-js,JavaScript,Mathematics and Numerics +accounting,JavaScript,Mathematics and Numerics +odometer,JavaScript,Mathematics and Numerics +datavore,JavaScript,Data Analysis +DB,JavaScript,Database Management +sql,JavaScript,Database Management +NeDB,JavaScript,Database Management +jStorage,JavaScript,Database Management +store,JavaScript,Database Management +cross-storage,JavaScript,File Handling +localForage,JavaScript,File Handling +console.log-wrapper,JavaScript,Logging and Monitoring +storybook,JavaScript,Graphical User Interface Design +minilog,JavaScript,Logging and Monitoring +loglevel,JavaScript,Logging and Monitoring +eslint,JavaScript,Code Analysis and Linting +jshint,JavaScript,Code Analysis and Linting +tslint,JavaScript,Code Analysis and Linting +sonarqube,JavaScript,Code Analysis and Linting +jest,JavaScript,Testing +Cypress,JavaScript,Testing +jasmine,JavaScript,Testing +qunit,JavaScript,Testing +fabric,JavaScript,Web Development +d3,JavaScript,Graphical User Interface Design +three,JavaScript,Graphical User Interface Design +sigma,JavaScript,Graphical User Interface Design +tempo,JavaScript,Graphical User Interface Design +jsfmt,JavaScript,Data Serialization +fecha,JavaScript,Data Serialization +protobufjs,JavaScript,Data Serialization +shelljs,JavaScript,IT Automation +forever,JavaScript,Scheduling and Concurrency +node-cron,JavaScript,Scheduling and Concurrency +jenkins,JavaScript,IT Automation +react,JavaScript,Web Development +vue,JavaScript,Web Development +electron,JavaScript,Web Development +angular,JavaScript,Web Development +stdgpu,C,Algorithms and Data Structures +urdfdom,C,Algorithms and Data Structures +cxxgraph,C,Algorithms and Data Structures +metis,C,Algorithms and Data Structures +nanomsg,C,Networking and Messaging +curl,C,Web Development +librabbitmq,C,Networking and Messaging +mosquitto,C,Networking and Messaging +uv,C,Scheduling and Concurrency +time,C,Scheduling and Concurrency +pth,C,Scheduling and Concurrency +pthread,C,Scheduling and Concurrency +OpenSSL,C,Security +GnuTLS,C,Security +libsodium,C,Security +libgcrypt,C,Security +facil.io,C,File Handling +kcgi,C,Web Development +KLone,C,Web Development +civetweb,C,Web Development +apophenia,C,Data Analysis +cmathl,C,Mathematics and Numerics +GSL,C,Mathematics and Numerics +SLEPc,C,Mathematics and Numerics +DuckDB,C,Database Management +MySQL,C,Database Management +sophia,C,Database Management +SQLite,C,Database Management +stdio,C,File Handling +POSIX,C,IT Automation +HDF5,C,File Handling +fstream,C,File Handling +syslog,C,Logging and Monitoring +spdlog,C,Logging and Monitoring +collectd,C,Data Analysis +nagios-plugins,C,IT Automation +libclang,C,Code Analysis and Linting +Cppcheck,C,Code Analysis and Linting +libclang-tidy,C,Code Analysis and Linting +Infer,C,Code Analysis and Linting +CMocka,C,Testing +MinUnit,C,Testing +Valgrind,C,Testing +Check,C,Testing +gsl-lite,C,Mathematics and Numerics +libcsv,C,Data 
Analysis +dataframe,C,Data Analysis +iqa,C,Data Analysis +libyaml,C,Data Serialization +libfmt,C,Data Serialization +flatbuffers,C,Data Serialization +msgpack-c,C,Data Serialization +nix_api_util,C,IT Automation +libcircmetrics,C,Logging and Monitoring +etcd-api,C,Networking and Messaging +cetcd,C,Networking and Messaging +microui,C,Graphical User Interface Design +tinyfiledialogs,C,Graphical User Interface Design +luigi ,C,IT Automation +GTK,C,Graphical User Interface Design +Akade.IndexedSet,C#,Algorithms and Data Structures +Akka.DistributedData,C#,Algorithms and Data Structures +dotnet-mgcb-compute,C#,Mathematics and Numerics +QuantConnect.Algorithm.CSharp,C#,Algorithms and Data Structures +Microsoft.AspNetCore.Connections,C#,Networking and Messaging +System.Net.Http.WinHttpHandler,C#,Web Development +Microsoft.AspNetCore.WebUtilities,C#,Web Development +MessagePipe,C#,Networking and Messaging +Microsoft.SemanticKernel.Plugins.MsGraph,C#,Algorithms and Data Structures +System.Threading.Tasks,C#,Scheduling and Concurrency +Hangfire,C#,Scheduling and Concurrency +OrchardCore.PublishLater,C#,Scheduling and Concurrency +CefSharp.WinForm.Net.Core,C#,Graphical User Interface Design +System.DirectoryServices.AccountManagement,C#,IT Automation +System.Security.Permissions,C#,Security +System.Security.AccessControl,C#,Security +@pavelsavara/dotnet-runtime,C#,IT Automation +@abp/ng.oauth,C#,Security +@abp/core,C#,Web Development +@abp/ng.components,C#,Web Development +SharpDX.Mathematics,C#,Mathematics and Numerics +AvaloniaMath,C#,Mathematics and Numerics +WpfMath,C#,Mathematics and Numerics +NCalcSync,C#,Mathematics and Numerics +microsoft.entityframeworkcore.tools,C#,Database Management +Dapper,C#,Database Management +Microsoft.Azure.Management.PostgreSQL,C#,Database Management +Microsoft.Azure.Management.CosmosDB,C#,Database Management +Reloaded.Mod.Loader.IO,C#,File Handling +DICOMcloud,C#,Data Analysis +Aurio,C#,Graphical User Interface Design +SeekableS3Stream,C#,File Handling +Microsoft.Extensions.Logging,C#,Logging and Monitoring +Microsoft.Azure.Management.Profiles.hybrid_2019_03_01.Monitor,C#,IT Automation +Azure.Monitor.OpenTelemetry.AspNetCore,C#,Logging and Monitoring +Microsoft.AspNetCore.Identity,C#,Security +roslyn,C#,Code Analysis and Linting +Microsoft.Toolkit.Uwp.PlatformSpecificAnalyzer,C#,Code Analysis and Linting +Uno.Microsoft.Toolkit.Uwp.PlatformSpecificAnalyzer,C#,Code Analysis and Linting +Microsoft.CST.ApplicationInspector.Common,C#,Code Analysis and Linting +Microsoft.AspNetCore.TestHost,C#,Testing +Microsoft.AspNetCore.Mvc.Testing,C#,Testing +Microsoft.AspNetCore.SignalR.Specification.Tests,C#,Testing +KIF,C#,Algorithms and Data Structures +Microsoft.Data.Analysis,C#,Data Analysis +Azure.Media.VideoAnalyzer.Edge,C#,Data Analysis +Google.Cloud.Trace.V1,C#,Logging and Monitoring +ClosedXML.Report,C#,Data Serialization +System.Formats,C#,Data Serialization +System.IO.Ports,C#,File Handling +System.Text.Json,C#,Data Serialization +App.Metrics.Formatters.Graphite,C#,Logging and Monitoring +Microsoft.Crank.AzureDevOpsWorker,C#,IT Automation +AWSSDK.DevOpsGuru,C#,IT Automation +Microsoft.SourceLink.AzureDevOpsServer.Git,C#,IT Automation +Saritasa.Tools.Messages.TestRuns,C#,Testing +SSRD.IdentityUI,C#,Security +bashforms,C#,Graphical User Interface Design +NSCI,C#,Algorithms and Data Structures +WSCT.GUI,C#,Graphical User Interface Design +lock-free,D,Algorithms and Data Structures +liblfdsd,D,Algorithms and Data Structures +bitranged,D,Algorithms and Data Structures 
+dstruct,D,Algorithms and Data Structures +vibe-d,D,Web Development +hunt-net,D,Networking and Messaging +nbuff,D,Algorithms and Data Structures +collie,D,Algorithms and Data Structures +photon,D,Algorithms and Data Structures +scheduled,D,Scheduling and Concurrency +meta,D,Code Analysis and Linting +ctini,D,Security +hunt-security,D,Security +hunt-shiro,D,Security +secured,D,Security +csprng,D,Security +pgator-backend,D,Web Development +hunt-cache,D,Data Analysis +formoshlep,D,Data Analysis +web-config,D,Web Development +simple-math,D,Mathematics and Numerics +evalex,D,Mathematics and Numerics +dualnumbers,D,Mathematics and Numerics +tau,D,Mathematics and Numerics +mysql-native,D,Database Management +derelict-pq,D,Database Management +ddbc,D,Database Management +dpq2,D,Database Management +inifiled,D,File Handling +fswatch,D,File Handling +tinyfiledialogs,D,Graphical User Interface Design +thepath,D,File Handling +hunt,D,Testing +gogga,D,Data Analysis +dlog,D,Logging and Monitoring +colorlog,D,Logging and Monitoring +code_checker,D,Code Analysis and Linting +dfmt,D,Data Serialization +dscanner,D,Code Analysis and Linting +dparse,D,Algorithms and Data Structures +silly,D,Algorithms and Data Structures +unit-threaded,D,Testing +fluent-asserts,D,Testing +dests,D,Algorithms and Data Structures +magpie,D,Algorithms and Data Structures +dvec,D,Mathematics and Numerics +d-tree,D,Algorithms and Data Structures +d_dataframes,D,Data Analysis +jsonizer,D,Data Serialization +mir-ion,D,Algorithms and Data Structures +protobuf,D,Data Serialization +siryul,D,Security +iup,D,Graphical User Interface Design +declui,D,Graphical User Interface Design +d_imgui,D,Graphical User Interface Design +dlangui,D,Graphical User Interface Design +libgit2,D,Database Management +yamkeys,D,Security +lua-jit-d,D,IT Automation +led,D,Graphical User Interface Design +array-tool,Rust,Algorithms and Data Structures +petgraph,Rust,Algorithms and Data Structures +heapless,Rust,Algorithms and Data Structures +argon2,Rust,Security +mio,Rust,Networking and Messaging +actix-rt,Rust,Scheduling and Concurrency +socket2,Rust,Networking and Messaging +crossbeam-channel,Rust,Networking and Messaging +cron,Rust,Scheduling and Concurrency +crossbeam-deque,Rust,Algorithms and Data Structures +smolscale,Rust,Data Analysis +job_scheduler,Rust,Scheduling and Concurrency +zeroize,Rust,Security +rocket,Rust,Web Development +rpassword,Rust,Security +trust-dns-resolver,Rust,Networking and Messaging +@farmfe/core,Rust,IT Automation +wasmer-clif-fork-frontend,Rust,Web Development +seed,Rust,Graphical User Interface Design +@farmfe/cli,Rust,IT Automation +num-traits,Rust,Mathematics and Numerics +num,Rust,Mathematics and Numerics +num-bigint,Rust,Mathematics and Numerics +cgmath,Rust,Mathematics and Numerics +rusqlite,Rust,Database Management +redis,Rust,Database Management +diesel,Rust,Database Management +postgres,Rust,Database Management +fs_extra,Rust,File Handling +toml,Rust,Data Serialization +tempfile,Rust,File Handling +zip,Rust,File Handling +log,Rust,Logging and Monitoring +env_logger,Rust,Logging and Monitoring +tracing,Rust,Logging and Monitoring +slog,Rust,Logging and Monitoring +@cubejs-backend/linter,Rust,Code Analysis and Linting +selene-lib,Rust,Data Analysis +ast-grep,Rust,Code Analysis and Linting +cargo-crev,Rust,Code Analysis and Linting +assert_cmd,Rust,Testing +quickcheck,Rust,Testing +proptest,Rust,Testing +wasm-bindgen-test,Rust,Testing +rls-analysis,Rust,Code Analysis and Linting +rstats,Rust,Data Analysis 
+amadeus-commoncrawl,Rust,Data Analysis +opendp,Rust,Data Analysis +serde,Rust,Data Serialization +serde_json,Rust,Data Serialization +serde_yaml,Rust,Data Serialization +bincode,Rust,Data Serialization +lsio,Rust,File Handling +shuttle-runtime,Rust,IT Automation +rustc_data_structures,Rust,Algorithms and Data Structures +compiler_base_span,Rust,Algorithms and Data Structures +slint,Rust,Algorithms and Data Structures +qinpel-wiz,Rust,Algorithms and Data Structures +arc,Rust,Algorithms and Data Structures +cushy,Rust,Algorithms and Data Structures +tumblr/XExtensionItem,Objective-C,Algorithms and Data Structures +TBQuadTree,Objective-C,Algorithms and Data Structures +POSDataStructures,Objective-C,Algorithms and Data Structures +PESGraph,Objective-C,Algorithms and Data Structures +AFNetworking,Objective-C,Networking and Messaging +CocoaAsyncSocket,Objective-C,Networking and Messaging +Atlas,Objective-C,Graphical User Interface Design +RestKit,Objective-C,Web Development +SZServerTimeManager,Objective-C,Scheduling and Concurrency +CalendarLib,Objective-C,Scheduling and Concurrency +Selene,Objective-C,Security +ZMJGanttChart,Objective-C,Graphical User Interface Design +AWSCognitoIdentityProviderASF,Objective-C,Security +gObfuscator,Objective-C,Security +Lockbox,Objective-C,Security +STPrivilegedTask,Objective-C,IT Automation +vtx,Objective-C,Algorithms and Data Structures +ColendiWebViewSDK,Objective-C,Web Development +@abp/bootstrap-daterangepicker,Objective-C,Web Development +@abp/ng.oauth,Objective-C,Security +vMAT,Objective-C,Mathematics and Numerics +crlibm,Objective-C,Mathematics and Numerics +MCKNumerics,Objective-C,Mathematics and Numerics +ACMatrix,Objective-C,Mathematics and Numerics +DKDBManager,Objective-C,Database Management +FlexileDatabase,Objective-C,Database Management +KKDSqlite,Objective-C,Database Management +SNDBManager,Objective-C,Database Management +APSmartStorage,Objective-C,File Handling +zipzap,Objective-C,File Handling +AliyunOSSiOS,Objective-C,File Handling +YTKKeyValueStore,Objective-C,Data Serialization +github.com/github.com/CocoaLumberjack/CocoaLumberjack,Objective-C,Logging and Monitoring +VENVersionTracker,Objective-C,IT Automation +NSLogger,Objective-C,Logging and Monitoring +NetworkEye,Objective-C,Networking and Messaging +nq-test-react-native-maps,Objective-C,Graphical User Interface Design +KIF,Objective-C,Testing +facebookarchive/xctool,Objective-C,Code Analysis and Linting +xctool,Objective-C,Code Analysis and Linting +KRGreyTheory,Objective-C,Mathematics and Numerics +DataGrinch,Objective-C,Data Analysis +XsdaKit,Objective-C,Data Serialization +cordova-pgyer-dandelion,Objective-C,Web Development +sbjson,Objective-C,Data Serialization +FXParser,Objective-C,Data Analysis +CSV,Objective-C,Data Analysis +NSMutableData+MultipartFormData,Objective-C,File Handling +Masonry,Objective-C,Graphical User Interface Design +Chameleon,Objective-C,Graphical User Interface Design +Nimbus,Objective-C,Graphical User Interface Design +GPUImage,Objective-C,Graphical User Interface Design +infer,Objective-C,Code Analysis and Linting +OCLint,Objective-C,Code Analysis and Linting +sonatype,Objective-C,IT Automation +sigrid,Objective-C,IT Automation +fastlane,Objective-C,IT Automation +hammerspoon,Objective-C,Graphical User Interface Design +punic,Objective-C,IT Automation +jenkins-mobile-pipeline-shared-libraries,Objective-C,IT Automation +brotli,Ocaml,Data Compression +dtoa,Ocaml,Algorithms and Data Structures +bin_tree,Ocaml,Algorithms and Data Structures 
+base_trie,Ocaml,Algorithms and Data Structures +apero-net,Ocaml,Networking and Messaging +conduit,Ocaml,Networking and Messaging +netamqp,Ocaml,Networking and Messaging +posix-mqueue,Ocaml,File Handling +bap-primus-exploring-scheduler,Ocaml,Scheduling and Concurrency +builder,Ocaml,IT Automation +daypack-lib,Ocaml,Data Analysis +riot,Ocaml,Web Development +tls,Ocaml,Security +osx-acl,Ocaml,Security +content_security_policy,Ocaml,Security +aws-sts,Ocaml,Security +async_websocket,Ocaml,Web Development +benchpress-server,Ocaml,Web Development +builder-web,Ocaml,Web Development +cduce_ws,Ocaml,Web Development +posix-math,Ocaml,Mathematics and Numerics +smol,Ocaml,Data Serialization +crlibm,Ocaml,Mathematics and Numerics +lem,Ocaml,Code Analysis and Linting +caqti,Ocaml,Database Management +dbforge,Ocaml,Database Management +irmin,Ocaml,Database Management +links-mysql,Ocaml,Database Management +bitlib,Ocaml,Algorithms and Data Structures +chamelon,Ocaml,Web Development +fpath,Ocaml,File Handling +fileutils,Ocaml,File Handling +bolt,Ocaml,Algorithms and Data Structures +dolog,Ocaml,Logging and Monitoring +easy_logging,Ocaml,Logging and Monitoring +loga,Ocaml,Logging and Monitoring +bisect_ppx,Ocaml,Code Analysis and Linting +calli,Ocaml,Algorithms and Data Structures +clangml-transforms,Ocaml,Algorithms and Data Structures +dolmen_bin,Ocaml,Algorithms and Data Structures +base_quickcheck,Ocaml,Testing +caravan,Ocaml,Web Development +kaputt,Ocaml,Algorithms and Data Structures +ounit2,Ocaml,Testing +conformist,Ocaml,Code Analysis and Linting +dataframe,Ocaml,Data Analysis +dsfo,Ocaml,Data Analysis +llama_midi,Ocaml,Graphical User Interface Design +atdgen,Ocaml,Code Analysis and Linting +bitpack_serializer,Ocaml,Data Serialization +coq-serapi,Ocaml,Algorithms and Data Structures +grpc,Ocaml,Networking and Messaging +bap-build,Ocaml,IT Automation +argsh,Ocaml,IT Automation +conf-automake,Ocaml,IT Automation +dtools,Ocaml,IT Automation +bogue,Ocaml,Algorithms and Data Structures +unison-gui,Ocaml,Graphical User Interface Design +imguiml,Ocaml,Graphical User Interface Design +altgr-ergo,Ocaml,Algorithms and Data Structures +bk-tree,Haskell,Algorithms and Data Structures +algebraic-graphs,Haskell,Algorithms and Data Structures +recursion-schemes,Haskell,Algorithms and Data Structures +AvlTree,Haskell,Algorithms and Data Structures +grenade,Haskell,Security +network-conduit,Haskell,Networking and Messaging +streamly,Haskell,Algorithms and Data Structures +hedgehog,Haskell,Testing +haxl,Haskell,Web Development +amazonka-scheduler,Haskell,Scheduling and Concurrency +massiv-scheduler,Haskell,Scheduling and Concurrency +gogol-datafusion,Haskell,Data Analysis +tamarin-prover-theory,Haskell,Mathematics and Numerics +tamarin-prover,Haskell,Mathematics and Numerics +yst,Haskell,Data Analysis +fireward,Haskell,Security +snap-core,Haskell,Web Development +snap-server,Haskell,Web Development +gogol-pagespeed,Haskell,Web Development +gogol-indexing,Haskell,Data Analysis +pandoc,Haskell,Data Serialization +Agda,Haskell,Mathematics and Numerics +math-functions,Haskell,Mathematics and Numerics +commodities,Haskell,Data Analysis +gogol-spanner,Haskell,Database Management +gogol-sqladmin,Haskell,Database Management +gogol-datastore,Haskell,Database Management +dbmigrations,Haskell,Database Management +bytestring,Haskell,File Handling +io-streams,Haskell,File Handling +regions,Haskell,Algorithms and Data Structures +amazonka-kinesis-video-webrtc-storage,Haskell,Data Analysis +tensorflow-logging,Haskell,Logging and 
Monitoring +wai-extra,Haskell,Web Development +co-log,Haskell,Logging and Monitoring +gogol-cloudmonitoring,Haskell,IT Automation +pandoc,Haskell,Data Serialization +cassava,Haskell,Data Analysis +commonmark,Haskell,Data Serialization +auto,Haskell,Code Analysis and Linting +amazonka-devops-guru,Haskell,IT Automation +deptrack-devops,Haskell,IT Automation +gogol-testing,Haskell,Testing +LogicGrowsOnTrees,Haskell,Algorithms and Data Structures +gogol-datafusion,Haskell,Data Analysis +vty-ui,Haskell,Graphical User Interface Design +YampaSynth,Haskell,Algorithms and Data Structures +master-plan,Haskell,IT Automation +stan,Haskell,Data Analysis +hlint,Haskell,Code Analysis and Linting +liquidhaskell,Haskell,Code Analysis and Linting +ghc,Haskell,IT Automation +purescript,Haskell,Code Analysis and Linting +ghcide-test-utils,Haskell,Testing +hls-test-utils,Haskell,Testing +yesod-test,Haskell,Testing +statistics,Haskell,Mathematics and Numerics +statistics-skinny,Haskell,Mathematics and Numerics +ajhc,Haskell,Code Analysis and Linting +fortran-src,Haskell,Algorithms and Data Structures +BitVector,Nim,Algorithms and Data Structures +rbtree,Nim,Algorithms and Data Structures +binaryheap,Nim,Algorithms and Data Structures +algorithm,Nim,Algorithms and Data Structures +nativesockets,Nim,Networking and Messaging +net,Nim,Networking and Messaging +nimrdkafka,Nim,Networking and Messaging +mqtt,Nim,Networking and Messaging +monotimes,Nim,Scheduling and Concurrency +times,Nim,Scheduling and Concurrency +osproc,Nim,IT Automation +schedules,Nim,Scheduling and Concurrency +nimcrypt,Nim,Security +seccomp,Nim,Security +nimpass,Nim,Security +quickcrypt,Nim,Security +nerve,Nim,Networking and Messaging +palladian,Nim,Web Development +staticserver,Nim,Web Development +phoon,Nim,Web Development +seqmath,Nim,Mathematics and Numerics +extmath,Nim,Mathematics and Numerics +geometrymath,Nim,Mathematics and Numerics +neo,Nim,Database Management +niledb,Nim,Database Management +couchdb,Nim,Database Management +zfdbms,Nim,Database Management +pdba,Nim,Database Management +osfiles,Nim,File Handling +fileinput,Nim,File Handling +filetype,Nim,File Handling +stor,Nim,File Handling +octolog,Nim,Logging and Monitoring +morelogging,Nim,Logging and Monitoring +promexplorer,Nim,Data Analysis +metrics,Nim,Data Analysis +nimfmt,Nim,Code Analysis and Linting +coco,Nim,Code Analysis and Linting +treesitter,Nim,Code Analysis and Linting +nimalyzer,Nim,Code Analysis and Linting +testify,Nim,Testing +nimtest,Nim,Testing +testutils,Nim,Testing +halonium,Nim,Networking and Messaging +nimdata,Nim,Data Analysis +datamancer,Nim,Data Analysis +nimdataframe,Nim,Data Analysis +mpfit,Nim,Mathematics and Numerics +tomlserialization,Nim,Data Serialization +protobufserialization,Nim,Data Serialization +bson,Nim,Data Serialization +eminim,Nim,Algorithms and Data Structures +autome,Nim,IT Automation +monit,Nim,Logging and Monitoring +autonim,Nim,IT Automation +nake,Nim,IT Automation +nimblegui,Nim,Graphical User Interface Design +nigui,Nim,Graphical User Interface Design +sigui,Nim,Graphical User Interface Design +rdgui,Nim,Graphical User Interface Design +de.sciss:fingertree_2.11,Scala,Algorithms and Data Structures +org.scalameta:semanticdb-scalac-core_2.11.12,Scala,Code Analysis and Linting +org.axle-lang:axle-algorithms_2.11,Scala,Algorithms and Data Structures +de.sciss:strugatzki_2.10,Scala,Algorithms and Data Structures +org.apache.spark:spark-network-common_2.11,Scala,Networking and Messaging 
+com.github.molecule-labs:molecule-net_2.9.3,Scala,Networking and Messaging +org.elasticmq,Scala,Database Management +com.typesafe.akka:akka-stream_2.12,Scala,Networking and Messaging +com.miguno.akka:akka-mock-scheduler_2.11,Scala,Scheduling and Concurrency +com.enragedginger:akka-quartz-scheduler_2.11,Scala,Scheduling and Concurrency +edu.gemini:lucuma-typed-scheduler_sjs1_3,Scala,Scheduling and Concurrency +io.getkyo:kyo-scheduler_2.13,Scala,Scheduling and Concurrency +dev.zio:zio-json_3,Scala,Data Serialization +dev.zio:zio-json_2.12,Scala,Data Serialization +recheck,Scala,Code Analysis and Linting +org.beangle.security:beangle-security-core,Scala,Security +com.softwaremill.sttp:async-http-client-backend-future_2.12,Scala,Web Development +com.softwaremill.sttp:akka-http-backend_2.12,Scala,Web Development +com.eed3si9n:gigahorse-okhttp_2.12,Scala,Web Development +com.softwaremill.sttp.client3:slf4j-backend_2.12,Scala,Logging and Monitoring +com.github.vagmcs:optimus_2.11,Scala,Mathematics and Numerics +com.github.vagmcs:optimus-solver-oj_2.11,Scala,Mathematics and Numerics +io.github.scalamath:vecmatlib,Scala,Mathematics and Numerics +io.github.scalamath:cmplxlib,Scala,Mathematics and Numerics +com.typesafe.slick:slick_2.11,Scala,Database Management +org.tpolecat:doobie-core_2.12,Scala,Database Management +org.reactivemongo:reactivemongo_2.11,Scala,Database Management +org.tpolecat:doobie-postgres_2.12,Scala,Database Management +org.specs2:specs2_2.11,Scala,Testing +com.github.pathikrit:better-files_2.12,Scala,File Handling +com.github.scala-incubator.io:scala-io-file_2.10,Scala,File Handling +de.sciss:audiofile_2.11,Scala,Data Analysis +com.typesafe.scala-logging:scala-logging_2.12,Scala,Logging and Monitoring +com.typesafe.scala-logging:scala-logging-slf4j_2.11,Scala,Logging and Monitoring +org.clapper:grizzled-slf4j_2.11,Scala,Logging and Monitoring +com.outr:scribe_2.12,Scala,Data Serialization +org.psywerx.hairyfotr.linter,Scala,Code Analysis and Linting +scala.meta.parsers,Scala,Algorithms and Data Structures +org.scalastyle,Scala,Code Analysis and Linting +com.sksamuel.scapegoat,Scala,Code Analysis and Linting +org.scala-js:scalajs-test-bridge_2.13,Scala,Testing +org.scala-js:scalajs-test-interface_2.12,Scala,Testing +com.typesafe.play:play-test_2.11,Scala,Testing +org.scalatest:scalatest_2.9.1,Scala,Testing +org.finra.megasparkdiff:mega-spark-diff,Scala,Data Analysis +com.github.vicpara:exploratory-data-analysis_2.10,Scala,Data Analysis +org.emmalanguage:emma,Scala,Data Analysis +org.emmalanguage:emma-benchmarks,Scala,Data Analysis +org.simplex3d:simplex3d-data-format_2.10,Scala,Data Serialization +org.wvlet.airframe:airframe-tablet_2.13.0-RC2,Scala,Data Serialization +org.gnieh:fs2-data-text_2.13,Scala,Data Serialization +com.fasterxml.jackson.module:jackson-module-scala_2.12,Scala,Data Serialization +tech.orkestra:orkestra-core_sjs0.6_2.12,Scala,IT Automation +com.goyeau:orchestra-cron_2.12,Scala,Scheduling and Concurrency +com.aamend.spark:archetype,Scala,IT Automation +io.kevinlee:sbt-devoops-github-core_2.12_1.0,Scala,IT Automation +de.sciss:dotterweide-ui_2.11,Scala,Graphical User Interface Design +org.scala-lang.modules.scala-swing,Scala,Graphical User Interface Design +io.github.kacperfkorban.guinep-web,Scala,Web Development +io.github.mimoguz.layeredfonticon-core,Scala,Graphical User Interface Design +piecemeal,Dart,Algorithms and Data Structures +collection,Dart,Algorithms and Data Structures +pointycastle,Dart,Security +graphs,Dart,Algorithms and Data Structures 
+connectivity_plus,Dart,Networking and Messaging +cached_network_image,Dart,File Handling +connectivity,Dart,Networking and Messaging +firebase_messaging,Dart,Networking and Messaging +reflutter,Dart,Web Development +server_universe,Dart,Web Development +create-fullstack-app-cli,Dart,IT Automation +angel_graphql,Dart,Web Development +flutter_local_notifications,Dart,Graphical User Interface Design +cron,Dart,Scheduling and Concurrency +timer_builder,Dart,Scheduling and Concurrency +syncfusion_flutter_calendar,Dart,Graphical User Interface Design +google_sign_in,Dart,Security +mqtt_client,Dart,Networking and Messaging +angel_security,Dart,Security +envied,Dart,Code Analysis and Linting +math_expressions,Dart,Mathematics and Numerics +more,Dart,Algorithms and Data Structures +ml_linalg,Dart,Mathematics and Numerics +fixed,Dart,Algorithms and Data Structures +sqflite,Dart,Database Management +cloud_firestore,Dart,Database Management +postgres,Dart,Database Management +hive,Dart,Database Management +path_provider,Dart,File Handling +image,Dart,Graphical User Interface Design +glob,Dart,File Handling +file,Dart,File Handling +logging,Dart,Logging and Monitoring +logger,Dart,Logging and Monitoring +ansicolor,Dart,Logging and Monitoring +pretty_dio_logger,Dart,Logging and Monitoring +flutter_lints,Dart,Code Analysis and Linting +pedantic_mono,Dart,Code Analysis and Linting +carapacik_lints,Dart,Code Analysis and Linting +velvet_custom_lints,Dart,Code Analysis and Linting +test,Dart,Testing +unittest,Dart,Testing +build_test,Dart,Testing +mocktail,Dart,Testing +grizzly_array,Dart,Algorithms and Data Structures +flutter_insights,Dart,Data Analysis +packhorse,Dart,IT Automation +plugin_mappintelligence,Dart,IT Automation +yaml,Dart,Data Serialization +http_parser,Dart,Web Development +built_value,Dart,Data Serialization +bson,Dart,Data Serialization +unleash,Dart,IT Automation +docrunner,Dart,IT Automation +cobertura,Dart,Code Analysis and Linting +bitwarden_secrets,Dart,Security +magical_widget,Dart,Graphical User Interface Design +flutter_auto_gui,Dart,Graphical User Interface Design +gui_shape,Dart,Graphical User Interface Design +rinf,Dart,Algorithms and Data Structures +collections,Python,Algorithms and Data Structures +heapq,Python,Algorithms and Data Structures +algorithms,Python,Algorithms and Data Structures +sortedcontainers,Python,Algorithms and Data Structures +asyncio,Python,Networking and Messaging +socket,Python,Networking and Messaging +kafka-python,Python,Networking and Messaging +dramatiq,Python,Networking and Messaging +arrow,Python,Scheduling and Concurrency +dateutil,Python,Scheduling and Concurrency +threading-framework,Python,Scheduling and Concurrency +schedule,Python,Scheduling and Concurrency +authlib,Python,Security +pyjwt,Python,Security +django-allauth,Python,Security +cryptography,Python,Security +webassets,Python,Web Development +html2text,Python,Web Development +websockets,Python,Web Development +tornado,Python,Web Development +scipy,Python,Mathematics and Numerics +numpy,Python,Mathematics and Numerics +statsmodels,Python,Mathematics and Numerics +sympy,Python,Mathematics and Numerics +pymysql,Python,Database Management +psycopg,Python,Database Management +pymongo,Python,Database Management +pickledb,Python,Database Management +mimetypes,Python,File Handling +pathlib,Python,File Handling +python-magic,Python,File Handling +watchdog,Python,File Handling +logging,Python,Logging and Monitoring +structlog,Python,Logging and Monitoring +loguru,Python,Logging
and Monitoring +psutil,Python,System Administration +flake8,Python,Code Analysis and Linting +pyflakes,Python,Code Analysis and Linting +pycodestyle,Python,Code Analysis and Linting +pylint,Python,Code Analysis and Linting +mamba,Python,Testing +pytest,Python,Testing +unittest,Python,Testing +selenium,Python,Web Development +pandas,Python,Data Analysis +optimus,Python,Data Analysis +schema,Python,Data Analysis +pydantic,Python,Data Serialization +marshmallow,Python,Data Serialization +pysimdjson,Python,Data Serialization +json,Python,Data Serialization +prophy,Python,Data Analysis +ansible,Python,IT Automation +pyinfra,Python,IT Automation +fabric,Python,IT Automation +borg,Python,System Administration +tkinter,Python,Graphical User Interface Design +pyglet,Python,Graphical User Interface Design +pyqt,Python,Graphical User Interface Design +kivy,Python,Graphical User Interface Design +Graph,Perl,Algorithms and Data Structures +MetaMap-DataStructures,Perl,Algorithms and Data Structures +Array-Circular,Perl,Algorithms and Data Structures +Tree-R,Perl,Algorithms and Data Structures +NetAddr-MAC,Perl,Networking and Messaging +Net-OpenSSH,Perl,Networking and Messaging +Parse-IPCommand,Perl,Networking and Messaging +Net-SSH2,Perl,Networking and Messaging +docpad-plugin-scheduling,Perl,Scheduling and Concurrency +Async-Event-Interval,Perl,Scheduling and Concurrency +Schedule-SGELK,Perl,Scheduling and Concurrency +Mojolicious-Plugin-Cron-Scheduler,Perl,Scheduling and Concurrency +DBIx-Class-BcryptColumn,Perl,Security +Crypt-DRBG,Perl,Security +WWW-KeePassRest,Perl,Web Development +Plack-Middleware-SecureHeaders,Perl,Security +Mojolicious,Perl,Web Development +Dancer2,Perl,Web Development +Catalyst,Perl,Web Development +Kossy,Perl,Web Development +SPVM-Math,Perl,Mathematics and Numerics +App-Math-Tutor,Perl,Mathematics and Numerics +Math-RPN-Tiny,Perl,Mathematics and Numerics +Math-Sidef,Perl,Mathematics and Numerics +DBD-mysql,Perl,Database Management +Redis,Perl,Database Management +github.com/percona/percona-toolkit,Perl,Database Management +Database-Abstraction,Perl,Database Management +Path-Tiny,Perl,File Handling +File-Util,Perl,File Handling +PDF-API2,Perl,Data Serialization +IO-All,Perl,File Handling +CPAN-Testers-Schema,Perl,Data Analysis +Log-Report,Perl,Logging and Monitoring +Log-Contextual,Perl,Logging and Monitoring +event-tracer,Perl,Logging and Monitoring +Perl-Lint,Perl,Code Analysis and Linting +Perl-Critic,Perl,Code Analysis and Linting +B-Lint,Perl,Code Analysis and Linting +Perl-Analyzer,Perl,Code Analysis and Linting +Test-Strict,Perl,Testing +Math-BigInt,Perl,Mathematics and Numerics +Test-MockModule,Perl,Testing +Test-Without-Module,Perl,Testing +CLIPSeqTools,Perl,Data Analysis +App-RecordStream,Perl,Data Analysis +Data::Table,Perl,Data Analysis +PDL::Dataframe,Perl,Data Analysis +wxPerl,Perl,Graphical User Interface Design +Perl-Tk,Perl,Graphical User Interface Design +Prima,Perl,Graphical User Interface Design +Perl/KDE,Perl,Graphical User Interface Design +AnyData,Perl,Data Serialization +Data-Format-Pretty-YAML,Perl,Data Serialization +TOML-Tiny,Perl,Data Serialization +CatalystX-Controller-ExtJS-REST-SimpleExcel,Perl,Web Development +Rex,Perl,IT Automation +com.viliussutkus89:SampleLibraryForSonatypePromotionTesting,Perl,IT Automation +Jenkins::API,Perl,IT Automation +Minilla,Perl,IT Automation +@discordjs/collection,TypeScript,Algorithms and Data Structures +js-sdsl,TypeScript,Algorithms and Data Structures +typescript-collections,TypeScript,Algorithms and Data 
Structures +fast-array-diff,TypeScript,Algorithms and Data Structures +libp2p,TypeScript,Networking and Messaging +@multiformats/multiaddr,TypeScript,Networking and Messaging +@ethersproject/networks,TypeScript,Networking and Messaging +nats,TypeScript,Networking and Messaging +@types/node-schedule,TypeScript,Scheduling and Concurrency +agenda,TypeScript,Scheduling and Concurrency +@nestjs/schedule,TypeScript,Scheduling and Concurrency +@solid-primitives/scheduled,TypeScript,Scheduling and Concurrency +helmet,TypeScript,Security +snyk,TypeScript,Security +express-rate-limit,TypeScript,Web Development +jssha,TypeScript,Security +vite,TypeScript,Web Development +vue-template-compiler,TypeScript,Web Development +@testing-library/user-event,TypeScript,Testing +antd,TypeScript,Graphical User Interface Design +random-js,TypeScript,Mathematics and Numerics +math-expression-evaluator,TypeScript,Mathematics and Numerics +normal-distribution,TypeScript,Mathematics and Numerics +@mathigon/fermat,TypeScript,Mathematics and Numerics +mongodb,TypeScript,Database Management +sequelize,TypeScript,Database Management +firebase,TypeScript,Database Management +typeorm,TypeScript,Database Management +rollup-plugin-dts,TypeScript,Code Analysis and Linting +tsx,TypeScript,Code Analysis and Linting +ts-node-dev,TypeScript,Code Analysis and Linting +serve,TypeScript,Web Development +@oclif/errors,TypeScript,Error Handling +@storybook/addon-console,TypeScript,Graphical User Interface Design +conventional-changelog-writer,TypeScript,IT Automation +git-raw-commits,TypeScript,IT Automation +@codemirror/lint,TypeScript,Code Analysis and Linting +@start/plugin-lib-eslint,TypeScript,Code Analysis and Linting +remark-lint-fenced-code-flag-case,TypeScript,Code Analysis and Linting +tslint-rxjs-subject-restrictions-rule,TypeScript,Code Analysis and Linting +jest,TypeScript,Testing +ts-jest,TypeScript,Testing +babel-jest,TypeScript,Testing +vitest,TypeScript,Testing +data-forge,TypeScript,Data Analysis +vue-component-meta,TypeScript,Graphical User Interface Design +@opticss/element-analysis,TypeScript,Graphical User Interface Design +@antv/l7-scene,TypeScript,Graphical User Interface Design +table,TypeScript,Data Analysis +form-data-encoder,TypeScript,File Handling +ion-js,TypeScript,Web Development +@nsis/language-data,TypeScript,Data Analysis +docker-compose,TypeScript,IT Automation +commitlint-azure-pipelines-cli,TypeScript,IT Automation +azure-devops-node-api,TypeScript,IT Automation +@karmaniverous/get-dotenv,TypeScript,File Handling +happy-dom,TypeScript,Graphical User Interface Design +react-png-tooltip,TypeScript,Graphical User Interface Design +infamous,TypeScript,Graphical User Interface Design +lume,TypeScript,Web Development +github.com/davecgh/go-spew,Go,Algorithms and Data Structures +github.com/google/btree,Go,Algorithms and Data Structures +github.com/lann/ps,Go,Algorithms and Data Structures +github.com/cespare/xxhash/v2,Go,Algorithms and Data Structures +golang.org/x/net,Go,Networking and Messaging +github.com/vishvananda/netns,Go,Networking and Messaging +github.com/nats-io/nats,Go,Networking and Messaging +github.com/jackc/pgproto3/v2,Go,Database Management +k8s.io/kubernetes,Go,IT Automation +github.com/go-co-op/gocron,Go,Scheduling and Concurrency +atomicgo.dev/schedule,Go,Scheduling and Concurrency +github.com/jasonlvhit/gocron,Go,Scheduling and Concurrency +github.com/google/uuid,Go,Algorithms and Data Structures +github.com/golang-jwt/jwt/v4,Go,Security 
+github.com/microcosm-cc/bluemonday,Go,Security +github.com/99designs/keyring,Go,Security +github.com/gin-gonic/gin,Go,Web Development +github.com/go-redis/cache/v8,Go,Database Management +github.com/gorilla/sessions,Go,Web Development +github.com/labstack/echo/v4,Go,Web Development +gopkg.in/inf.v0,Go,Algorithms and Data Structures +github.com/go-corelibs/maths,Go,Mathematics and Numerics +github.com/go-inf/inf,Go,Algorithms and Data Structures +github.com/pkg/math,Go,Mathematics and Numerics +github.com/go-sql-driver/mysql,Go,Database Management +github.com/lib/pq,Go,Database Management +go.mongodb.org/mongo-driver,Go,Database Management +go.etcd.io/bbolt,Go,Database Management +github.com/pelletier/go-toml/v2,Go,Data Serialization +github.com/joho/godotenv,Go,File Handling +cloud.google.com/go/storage,Go,Database Management +github.com/minio/minio-go/v7,Go,Database Management +github.com/sirupsen/logrus,Go,Logging and Monitoring +go.uber.org/zap,Go,Logging and Monitoring +github.com/go-logr/logr,Go,Logging and Monitoring +go.opentelemetry.io/otel,Go,Logging and Monitoring +golang.org/x/lint,Go,Code Analysis and Linting +github.com/golangci/lint-1,Go,Code Analysis and Linting +github.com/mvdan/lint,Go,Code Analysis and Linting +github.com/golang/lint,Go,Code Analysis and Linting +github.com/stretchr/testify,Go,Testing +github.com/google/go-cmp,Go,Code Analysis and Linting +gopkg.in/check.v1,Go,Testing +github.com/onsi/ginkgo,Go,Testing +github.com/rocketlaunchr/dataframe-go,Go,Data Analysis +github.com/fjukstad/walrus,Go,Algorithms and Data Structures +github.com/hokiegeek/hgtealib,Go,Algorithms and Data Structures +github.com/forchain/bitcoinbigdata,Go,Data Analysis +github.com/google/orderedcode,Go,Code Analysis and Linting +github.com/ipfs/go-block-format,Go,File Handling +github.com/linkedin/goavro/v2,Go,Data Serialization +github.com/minio/sio,Go,File Handling +github.com/power-devops/perfstat,Go,Logging and Monitoring +github.com/gruntwork-io/terratest,Go,Testing +go.mozilla.org/sops/v3,Go,Security +github.com/vladimirvivien/gexe,Go,Algorithms and Data Structures +qtypes,Go,Algorithms and Data Structures +github.com/ctessum/gobra,Go,Algorithms and Data Structures +github.com/yogischogi/ui2go,Go,Graphical User Interface Design +github.com/bhojpur/gui,Go,Graphical User Interface Design diff --git a/transforms/code/semantic_profiler/python/src/ikb/null_libs.csv b/transforms/code/semantic_profiler/python/src/ikb/null_libs.csv new file mode 100644 index 000000000..e69de29bb diff --git a/transforms/code/semantic_profiler/python/src/offline_path/generate_ikb.py b/transforms/code/semantic_profiler/python/src/offline_path/generate_ikb.py new file mode 100644 index 000000000..eb966a77c --- /dev/null +++ b/transforms/code/semantic_profiler/python/src/offline_path/generate_ikb.py @@ -0,0 +1,123 @@ +import os +import argparse +import csv +import pyarrow as pa +import pyarrow.csv as pv +from io import StringIO,BytesIO +from watsonxai import generateResponseWatsonx + + +def getStringFromCSV(file): + table = pv.read_csv(file) + csv_buffer = StringIO() + column_names = table.column_names + csv_buffer.write(','.join(column_names) + '\n') + for row in range(table.num_rows): + row_data = [str(table[column][row].as_py()) for column in column_names] + csv_buffer.write(','.join(row_data) + '\n') + return csv_buffer.getvalue() + + + +def gen_combined_strings(file_data): + file_data = file_data.splitlines() + headers = file_data[0] + null_libraries = file_data[1:] + combined_strings = [] + combined_string 
= "" + for idx, entry in enumerate(null_libraries, start=1): + if combined_string == "": + combined_string += f"{headers.strip()}\n" + combined_string += f"{entry}\n" + if idx % 30 == 0 or idx == len(null_libraries): + combined_strings.append(combined_string) + combined_string = "" + return combined_strings + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser(description='Generate IKB.') + parser.add_argument('--null_libs_file', type=str, help='Path to null libraries file.', default=os.getenv('NULL_LIBS_FILE', '../ikb/null_libs.csv')) + parser.add_argument('--cmap_file', type=str, help='Path to concept map file.', default=os.getenv('CMAP_FILE', '../concept_map/updated_concept_list.csv')) + parser.add_argument('--input_examples_file', type=str, help='Path to input examples file.', default=os.getenv('EXAMPLES_I_FILE', '../examples/examples-i.csv')) + parser.add_argument('--output_examples_file', type=str, help='Path to output examples file.', default=os.getenv('EXAMPLES_O_FILE', '../examples/examples-o.csv')) + parser.add_argument('--extracted_data_file', type=str, help='Path to file in which LLM output will be stored.', default=os.getenv('EXTRACTED_DATA_FILE', '../ikb/extracted_data.csv')) + parser.add_argument('--api_type', type=str, help='API Type', default=os.getenv('API_TYPE', 'WatsonxAI')) + parser.add_argument('--api_key', type=str, help='API key', default=os.getenv('API_KEY', '')) + parser.add_argument('--api_endpoint', type=str, help='API endpoint', default=os.getenv('API_ENDPOINT', 'https://us-south.ml.cloud.ibm.com')) + parser.add_argument('--project_id', type=str, help='Project ID', default=os.getenv('PROJECT_ID', '')) + parser.add_argument('--model_id', type=str, help='LLM model ID', default=os.getenv('MODEL_ID', 'meta-llama/llama-3-70b-instruct')) + + + + args = parser.parse_args() + concepts = getStringFromCSV(args.cmap_file) + input_examples = getStringFromCSV(args.input_examples_file) + output_examples = getStringFromCSV(args.output_examples_file) + + null_libs_file_data = getStringFromCSV(args.null_libs_file) + combined_strings = gen_combined_strings(null_libs_file_data) + + endtoken = "" + prompt_name = "My-prompt" + prompt_template = '''You are responsible for classifying programming language packages based on their functionality into one of the following STRICT categories: + ''' + concepts + ''' + + Instructions: + + 1. Input: A CSV containing two columns: + a. Library – the name of the package + b. Language – the programming language of the package + Your task is to append a third column called Category where you will classify the package's primary function into one of the following categories.\n + + 2. Output: The updated CSV with the new Category column. + + 3. Categorization Guidelines: + a. Classify each package based on its primary functionality. + b. Only use categories from the given list. Do not invent or modify categories. + + 4. Output format: Provide the updated CSV data in the exact format as shown below: + a. Columns: Library, Language, Category + b. End the response with to indicate completion. + c. Do not include any double quotes in the output. + + 5. Only use categories from the given list. Do not invent or modify categories. + + 6. Strictly do not provide any explanations or commentary or notes before and/or after the table. 
+
+    Examples:
+    INPUT:
+    ''' + str(input_examples) + "OUTPUT:\n" + str(output_examples).strip("\n")+"\n"
+
+    headers = ["Library", "Language", "Category"]
+    file_exists = os.path.exists(args.extracted_data_file)
+    if not file_exists:
+        with open(args.extracted_data_file, mode='w', newline='') as f:
+            csv_writer = csv.writer(f, quoting=csv.QUOTE_NONE, escapechar='\\')
+            csv_writer.writerow(headers)
+
+
+    for combined_string in combined_strings:
+        input_template = prompt_template + f"\n\nINPUT: {combined_string} \nOUTPUT: "
+        if args.api_type != 'WatsonxAI':
+            # Only the watsonx.ai path is implemented; skip other API types
+            # rather than falling through to an undefined response.
+            continue
+        response = generateResponseWatsonx(args.api_key, args.api_endpoint, args.model_id, args.project_id, input_template)
+        # Guard the split: str.split raises ValueError on an empty separator.
+        data = response.split(endtoken)[0] if endtoken else response
+        csv_content = data.splitlines()
+        not_first_row = 0
+        with open(args.extracted_data_file, mode='a', newline='') as f:
+            csv_writer = csv.writer(f, quoting=csv.QUOTE_NONE, escapechar='\\')
+            for line in csv_content:
+                if not_first_row:
+                    row = line.split(',')
+                    csv_writer.writerow(row)
+                not_first_row = 1
+
+
+
+
+
+
+
+
diff --git a/transforms/code/semantic_profiler/python/src/offline_path/watsonxai.py b/transforms/code/semantic_profiler/python/src/offline_path/watsonxai.py
new file mode 100644
index 000000000..e346c1894
--- /dev/null
+++ b/transforms/code/semantic_profiler/python/src/offline_path/watsonxai.py
@@ -0,0 +1,25 @@
+from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
+from ibm_watsonx_ai.foundation_models import ModelInference
+from ibm_watsonx_ai import Credentials
+
+
+
+
+def generateResponseWatsonx(api_key, api_endpoint, model_id, project_id, input_template):
+    credentials = Credentials(api_key=api_key, url=api_endpoint)
+    parameters = {
+        GenParams.DECODING_METHOD: "greedy",
+        GenParams.MAX_NEW_TOKENS: 100,
+        GenParams.STOP_SEQUENCES: [""]
+    }
+    model = ModelInference(
+        model_id=model_id,
+        params=parameters,
+        credentials=credentials,
+        project_id=project_id)
+    response = model.generate_text(input_template)
+    return response
+
+
+
+
diff --git a/transforms/code/semantic_profiler/python/src/sp_helper.py b/transforms/code/semantic_profiler/python/src/sp_helper.py
new file mode 100644
index 000000000..3986abda4
--- /dev/null
+++ b/transforms/code/semantic_profiler/python/src/sp_helper.py
@@ -0,0 +1,91 @@
+import pyarrow.csv as pacsv
+import csv
+
+
+
+class TrieNode:
+    '''
+    Implements one node of a Trie datastructure
+    '''
+    def __init__(self):
+        self.children = {}
+        self.is_end_of_word = False
+        self.data = None
+
+class Trie:
+    '''
+    Implements a Trie datastructure for efficient retrieval of concepts from the IKB.
+    '''
+    def __init__(self):
+        self.root = TrieNode()
+
+    def insert(self, library_name, programming_language, functionality):
+        node = self.root
+        for char in library_name:
+            if char not in node.children:
+                node.children[char] = TrieNode()
+            node = node.children[char]
+        node.data = {}
+        node.data['Category'] = functionality
+        node.data['Language'] = programming_language
+        node.is_end_of_word = True
+
+    def search(self, library_name, programming_language):
+        node = self.root
+        for char in library_name:
+            if char not in node.children:
+                return None
+            node = node.children[char]
+        if node.is_end_of_word and node.data:
+            return node.data
+        return None
+
+
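To make the retrieval behaviour concrete, here is a minimal usage sketch of the Trie above with two hypothetical entries; note that `search` currently keys only on the library name, so its `programming_language` argument has no effect on the lookup:

```python
# Minimal sketch of the Trie above; the two entries are hypothetical.
from sp_helper import Trie

trie = Trie()
trie.insert("numpy", "Python", "Mathematics and Numerics")
trie.insert("flask", "Python", "Web Development")

# An exact, lower-cased library name resolves to the data stored at its
# terminal node ...
assert trie.search("numpy", "Python") == {"Category": "Mathematics and Numerics", "Language": "Python"}
# ... while unknown names and bare prefixes return None.
assert trie.search("nump", "Python") is None
assert trie.search("requests", "Python") is None
```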
+class knowledge_base:
+    '''
+    Implements the internal knowledge base.
+    '''
+    def __init__(self, ikb_file, null_libs_file):
+        self.knowledge_base_file = ikb_file
+        self.null_file = null_libs_file
+        # Per-instance state: the table and trie are built by load_ikb_trie,
+        # and the null-coverage set must not be shared across instances.
+        self.knowledge_base_table = None
+        self.knowledge_base_trie = None
+        self.entries_with_null_coverage = set()
+
+    def load_ikb_trie(self):
+        self.knowledge_base_table = pacsv.read_csv(self.knowledge_base_file)
+        self.knowledge_base_trie = Trie()
+        library_column = self.knowledge_base_table.column('Library').to_pylist()
+        language_column = self.knowledge_base_table.column('Language').to_pylist()
+        category_column = self.knowledge_base_table.column('Category').to_pylist()
+        for library, language, category in zip(library_column, language_column, category_column):
+            self.knowledge_base_trie.insert(str.lower(library), language, category)
+
+    def write_null_files(self):
+        with open(self.null_file, 'a+', newline='', encoding='utf-8') as csvfile:
+            writer = csv.writer(csvfile)
+            for entry in self.entries_with_null_coverage:
+                writer.writerow([entry[0], entry[1]])
+        self.entries_with_null_coverage = set()
+
+
+def concept_extractor(libraries, language, ikb):
+    '''
+    Given a set of libraries and the corresponding programming language along with the IKB trie, this function
+    returns the matching concept(s) as a comma separated list joined into a string.
+    '''
+    concept_coverage = set()
+    libraries = [item.strip() for item in libraries.split(",")]
+    for library in libraries:
+        if library:
+            extracted_base_name = str.lower(library)
+            matched_entry = ikb.knowledge_base_trie.search(extracted_base_name, language)
+            if matched_entry:
+                concept_coverage.add(matched_entry['Category'].strip())
+            else:
+                ikb.entries_with_null_coverage.add((library, language))
+    return ','.join(sorted(concept_coverage))
\ No newline at end of file
diff --git a/transforms/code/semantic_profiler/python/src/sp_local.py b/transforms/code/semantic_profiler/python/src/sp_local.py
new file mode 100644
index 000000000..33a863f7a
--- /dev/null
+++ b/transforms/code/semantic_profiler/python/src/sp_local.py
@@ -0,0 +1,35 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+import os
+
+from data_processing.data_access import DataAccessLocal
+from sp_transform import SemanticProfilerTransform
+
+
+# create parameters
+input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input"))
+
+sp_params = {"ikb_file": "src/ikb/ikb_model.csv", "null_libs_file": "src/ikb/null_libs.csv"}
+
+if __name__ == "__main__":
+    # Here we show how to run outside of the runtime
+    # Create and configure the transform.
+    transform = SemanticProfilerTransform(sp_params)
+    # Use the local data access to read a parquet table.
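Putting the helpers together, here is a minimal round trip through `knowledge_base` and `concept_extractor`; the two-row IKB, the temporary paths, and the library names are all hypothetical, and the local runner resumes just below:

```python
# Round trip through knowledge_base / concept_extractor; the rows and
# paths here are hypothetical.
import os
import tempfile

from sp_helper import knowledge_base, concept_extractor

tmp_dir = tempfile.mkdtemp()
ikb_path = os.path.join(tmp_dir, "ikb_model.csv")
null_path = os.path.join(tmp_dir, "null_libs.csv")
with open(ikb_path, "w") as f:
    f.write("Library,Language,Category\n"
            "numpy,Python,Mathematics and Numerics\n"
            "flask,Python,Web Development\n")

ikb = knowledge_base(ikb_path, null_path)
ikb.load_ikb_trie()  # builds the trie over lower-cased library names

# Known libraries resolve to a sorted, comma-joined category string.
print(concept_extractor("numpy, flask", "Python", ikb))
# -> Mathematics and Numerics,Web Development

# Unknown libraries yield an empty string and are remembered, so that
# write_null_files() can append them to null_libs.csv for the offline path.
print(concept_extractor("not-a-real-lib", "Python", ikb))
ikb.write_null_files()
```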
+ data_access = DataAccessLocal() + table, _ = data_access.get_table(os.path.join(input_folder, "test.parquet")) + print(f"input table: {table}") + # Transform the table + table_list, metadata = transform.transform(table) + print(f"\noutput table: {table_list}") + print(f"output metadata : {metadata}") diff --git a/transforms/code/semantic_profiler/python/src/sp_local_python.py b/transforms/code/semantic_profiler/python/src/sp_local_python.py new file mode 100644 index 000000000..be468d761 --- /dev/null +++ b/transforms/code/semantic_profiler/python/src/sp_local_python.py @@ -0,0 +1,47 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import sys + +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.utils import ParamsUtils +from sp_transform_python import SemanticProfilerPythonTransformConfiguration + + +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # Data access. Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + # execution info + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), + # sp params + "sp_ikb_file": "src/ikb/ikb_model.csv", + "sp_null_libs_file": "src/ikb/null_libs.csv" + +} +if __name__ == "__main__": + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + # create launcher + launcher = PythonTransformLauncher(runtime_config=SemanticProfilerPythonTransformConfiguration()) + # Launch the ray actor(s) to process the input + launcher.launch() diff --git a/transforms/code/semantic_profiler/python/src/sp_local_python_multiprocessor.py b/transforms/code/semantic_profiler/python/src/sp_local_python_multiprocessor.py new file mode 100644 index 000000000..607c42770 --- /dev/null +++ b/transforms/code/semantic_profiler/python/src/sp_local_python_multiprocessor.py @@ -0,0 +1,47 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import os +import sys + +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.utils import ParamsUtils +from sp_transform_python import SemanticProfilerPythonTransformConfiguration + + +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # Data access. Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + # execution info + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), + # "runtime_num_processors": 2, + # sp params + "sp_ikb_file": "src/ikb/ikb_model.csv", + "sp_null_libs_file": "src/ikb/null_libs.csv" +} +if __name__ == "__main__": + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + # create launcher + launcher = PythonTransformLauncher(runtime_config=SemanticProfilerPythonTransformConfiguration()) + # Launch the ray actor(s) to process the input + launcher.launch() diff --git a/transforms/code/semantic_profiler/python/src/sp_transform.py b/transforms/code/semantic_profiler/python/src/sp_transform.py new file mode 100644 index 000000000..9069ca805 --- /dev/null +++ b/transforms/code/semantic_profiler/python/src/sp_transform.py @@ -0,0 +1,124 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + + +from argparse import ArgumentParser, Namespace +from typing import Any + +import pyarrow as pa +from data_processing.transform import AbstractTableTransform, TransformConfiguration +from data_processing.utils import CLIArgumentProvider +from sp_helper import * + + +short_name = "sp" +cli_prefix = f"{short_name}_" + +ikb_file = "ikb_file" +null_libs_file = "null_libs_file" + +ikb_file_cli_param = f"{cli_prefix}{ikb_file}" +null_libs_file_cli_param = f"{cli_prefix}{null_libs_file}" + + + +class SemanticProfilerTransform(AbstractTableTransform): + """ + Implements the semantic profiler transform on a pyarrow table + """ + + def __init__(self, config: dict[str, Any]): + """ + Initialize based on the dictionary of configuration information. + This is generally called with configuration parsed from the CLI arguments defined + by the companion runtime, SemanticProfilerTransformRuntime. If running inside the RayMutatingDriver, + these will be provided by that class with help from the RayMutatingDriver. 
+ """ + # Make sure that the param name corresponds to the name used in apply_input_params method + # of SemanticProfilerTransformConfiguration class + super().__init__(config) + self.ikb_file = config.get("ikb_file", "../src/ikb/ikb_model.csv") + self.null_libs_file = config.get("null_libs_file", "../src/ikb/null_libs.csv") + + def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]: + """ + Put Transform-specific to convert one Table to 0 or more tables. It also returns + a dictionary of execution statistics - arbitrary dictionary + This implementation takes a pyarrow table (ouput of the USBR transform) as input and obtains the + semantic mapping of each datapoint from the Internal Knowledge Base. These semantic concepts are added as + a new column into the input table and returned as output. The points for which no semantic mapping is found are + written into the "null_libs.csv" file. + """ + self.logger.debug(f"Transforming one table with {len(table)} rows") + ikb = knowledge_base(self.ikb_file, self.null_libs_file) + ikb.load_ikb_trie() + libraries = table.column('Library').to_pylist() + language = table.column('Language').to_pylist() + concepts = [concept_extractor(lib, lang, ikb) for lib, lang in zip(libraries, language)] + new_col = pa.array(concepts) + table = table.append_column('Concepts', new_col) + ikb.write_null_files() + # Add some sample metadata. + self.logger.debug(f"Transformed one table with {len(table)} rows") + metadata = {"nfiles": 1, "nrows": len(table)} + return [table], metadata + + +class SemanticProfilerTransformConfiguration(TransformConfiguration): + + """ + Provides support for configuring and using the associated Transform class include + configuration with CLI args. + """ + + def __init__(self): + super().__init__( + name=short_name, + transform_class=SemanticProfilerTransform, + ) + from data_processing.utils import get_logger + + self.logger = get_logger(__name__) + + def add_input_params(self, parser: ArgumentParser) -> None: + """ + Add Transform-specific arguments to the given parser. + This will be included in a dictionary used to initialize the SemanticProfilerTransform. + By convention a common prefix should be used for all transform-specific CLI args + (e.g, sp_, pii_, etc.) + """ + + parser.add_argument( + f"--{ikb_file_cli_param}", + type=str, + default=None, + help="Default IKB file", + ) + + parser.add_argument( + f"--{null_libs_file_cli_param}", + type=str, + default=None, + help="Default Null Libraries file", + ) + + + def apply_input_params(self, args: Namespace) -> bool: + """ + Validate and apply the arguments that have been parsed + :param args: user defined arguments. + :return: True, if validate pass or False otherwise + """ + captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False) + self.params = self.params | captured + self.logger.info(f"sp parameters are : {self.params}") + return True diff --git a/transforms/code/semantic_profiler/python/src/sp_transform_python.py b/transforms/code/semantic_profiler/python/src/sp_transform_python.py new file mode 100644 index 000000000..d35328364 --- /dev/null +++ b/transforms/code/semantic_profiler/python/src/sp_transform_python.py @@ -0,0 +1,45 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import time + +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.runtime.pure_python.runtime_configuration import ( + PythonTransformRuntimeConfiguration, +) +from data_processing.utils import get_logger +from sp_transform import SemanticProfilerTransformConfiguration + + +logger = get_logger(__name__) + + +class SemanticProfilerPythonTransformConfiguration(PythonTransformRuntimeConfiguration): + """ + Implements the PythonTransformConfiguration for SemanticProfiler as required by the PythonTransformLauncher. + SemanticProfiler does not use a RayRuntime class so the superclass only needs the base + python-only configuration. + """ + + def __init__(self): + """ + Initialization + :param base_configuration - base configuration class + """ + super().__init__(transform_config=SemanticProfilerTransformConfiguration()) + + +if __name__ == "__main__": + # launcher = SemanticProfilerRayLauncher() + launcher = PythonTransformLauncher(SemanticProfilerPythonTransformConfiguration()) + logger.info("Launching sp transform") + launcher.launch() diff --git a/transforms/code/semantic_profiler/python/test-data/expected/metadata.json b/transforms/code/semantic_profiler/python/test-data/expected/metadata.json new file mode 100644 index 000000000..8797e64fa --- /dev/null +++ b/transforms/code/semantic_profiler/python/test-data/expected/metadata.json @@ -0,0 +1,46 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "SemanticProfiler", + "job type": "ray", + "job id": "job_id", + "start_time": "2024-03-01 15:17:56", + "end_time": "2024-03-01 15:17:57", + "status": "success" + }, + "code": [null], + "job_input_params": { + "sleep": 0, + "checkpointing": false, + "max_files": -1, + "number of workers": 1, + "worker options": { + "num_cpus": 0.8 + }, + "actor creation delay": 0 + }, + "execution_stats": { + "cpus": 10, + "gpus": 0, + "memory": 14.031964112073183, + "object_store": 2.0 + }, + "job_output_stats": { + "source_files": 1, + "source_size": 16534, + "result_files": 1, + "result_size": 16534, + "table_processing": 0.012392997741699219, + "nfiles": 1, + "nrows": 5 + }, + "source": { + "name": "test-data/data_processing/ray/sp/input", + "type": "path" + }, + "target": { + "name": "/tmp/SP4o9gv2bq", + "type": "path" + } +} diff --git a/transforms/code/semantic_profiler/python/test-data/expected/test.parquet b/transforms/code/semantic_profiler/python/test-data/expected/test.parquet new file mode 100644 index 000000000..748db85ba Binary files /dev/null and b/transforms/code/semantic_profiler/python/test-data/expected/test.parquet differ diff --git a/transforms/code/semantic_profiler/python/test-data/input/test.parquet b/transforms/code/semantic_profiler/python/test-data/input/test.parquet new file mode 100644 index 000000000..f9ac1f024 Binary files /dev/null and b/transforms/code/semantic_profiler/python/test-data/input/test.parquet differ diff --git a/transforms/code/semantic_profiler/python/test/test_sp.py 
b/transforms/code/semantic_profiler/python/test/test_sp.py new file mode 100644 index 000000000..172858945 --- /dev/null +++ b/transforms/code/semantic_profiler/python/test/test_sp.py @@ -0,0 +1,45 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ +import os + +import pyarrow as pa +from data_processing.test_support import get_tables_in_folder +from data_processing.test_support.transform.table_transform_test import ( + AbstractTableTransformTest, +) +from sp_transform import SemanticProfilerTransform, null_libs_file, ikb_file + + +# table = pa.Table.from_pydict({"name": pa.array(["Tom"]), "age": pa.array([23])}) +# expected_table = table # We're a sp after all. +# expected_metadata_list = [{"nfiles": 1, "nrows": 1}, {}] # transform() result # flush() result + + +class TestSemanticProfilerTransform(AbstractTableTransformTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + src_file_dir = os.path.abspath(os.path.dirname(__file__)) + input_dir = os.path.join(src_file_dir, "../test-data/input") + expected_dir = os.path.join(src_file_dir, "../test-data/expected") + input_tables = get_tables_in_folder(input_dir) + expected_tables = get_tables_in_folder(expected_dir) + + expected_metadata_list = [{"nfiles": 1, "nrows": len(expected_tables[0])}, {}] + config = {ikb_file: os.path.join(src_file_dir,"../src/ikb/ikb_model.csv"), null_libs_file: os.path.join(src_file_dir,"../src/ikb/null_libs.csv")} + fixtures = [ + (SemanticProfilerTransform(config), input_tables, expected_tables, expected_metadata_list), + ] + return fixtures diff --git a/transforms/code/semantic_profiler/python/test/test_sp_python.py b/transforms/code/semantic_profiler/python/test/test_sp_python.py new file mode 100644 index 000000000..191978dbc --- /dev/null +++ b/transforms/code/semantic_profiler/python/test/test_sp_python.py @@ -0,0 +1,48 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import os + +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from sp_transform import ikb_file_cli_param, null_libs_file_cli_param +from sp_transform_python import SemanticProfilerPythonTransformConfiguration + + +class TestPythonSemanticProfilerTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + src_file_dir = os.path.abspath(os.path.dirname(__file__)) + fixtures = [] + + launcher = PythonTransformLauncher(SemanticProfilerPythonTransformConfiguration()) + input_dir = os.path.join(src_file_dir, "../test-data/input") + expected_dir = os.path.join(src_file_dir, "../test-data/expected") + + transform_config = {ikb_file_cli_param: os.path.join(src_file_dir, "../src/ikb/ikb_model.csv"), null_libs_file_cli_param: os.path.join(src_file_dir, "../src/ikb/null_libs.csv")} + fixtures.append( + ( + launcher, + transform_config, + input_dir, + expected_dir, + [], # optional list of column names to ignore in comparing test-generated with expected. + ) + ) + + return fixtures diff --git a/transforms/code/semantic_profiler/ray/.dockerignore b/transforms/code/semantic_profiler/ray/.dockerignore new file mode 100644 index 000000000..f7275bbbd --- /dev/null +++ b/transforms/code/semantic_profiler/ray/.dockerignore @@ -0,0 +1 @@ +venv/ diff --git a/transforms/code/semantic_profiler/ray/.gitignore b/transforms/code/semantic_profiler/ray/.gitignore new file mode 100644 index 000000000..3ea7fd4ab --- /dev/null +++ b/transforms/code/semantic_profiler/ray/.gitignore @@ -0,0 +1,38 @@ +test-data/output +output/* +/output/ +data-processing-lib/ + + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + + +# Distribution / packaging +bin/ +build/ +develop-eggs/ +dist/ +eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +*.egg-info/ +.installed.cfg +*.egg + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +.tox/ +htmlcov +.coverage +.cache +nosetests.xml +coverage.xml \ No newline at end of file diff --git a/transforms/code/semantic_profiler/ray/Dockerfile b/transforms/code/semantic_profiler/ray/Dockerfile new file mode 100644 index 000000000..df7d2a5a4 --- /dev/null +++ b/transforms/code/semantic_profiler/ray/Dockerfile @@ -0,0 +1,42 @@ +ARG BASE_IMAGE=docker.io/rayproject/ray:2.24.0-py310 +FROM ${BASE_IMAGE} + +RUN pip install --upgrade --no-cache-dir pip + +# install pytest +RUN pip install --no-cache-dir pytest + +# Copy and install data processing libraries +# These are expected to be placed in the docker context before this is run (see the make image). +COPY --chown=ray:users data-processing-lib-python/ data-processing-lib-python/ +RUN cd data-processing-lib-python && pip install --no-cache-dir -e . +COPY --chown=ray:users data-processing-lib-ray/ data-processing-lib-ray/ +RUN cd data-processing-lib-ray && pip install --no-cache-dir -e . +COPY --chown=ray:users python-transform/ python-transform/ +RUN cd python-transform && pip install --no-cache-dir -e . 
+ +#COPY requirements.txt requirements.txt +#RUN pip install --no-cache-dir -r requirements.txt + +COPY --chown=ray:users src/ src/ +COPY --chown=ray:users pyproject.toml pyproject.toml +RUN pip install --no-cache-dir -e . + +# copy the main() entry point to the image +COPY ./src/sp_transform_ray.py . + +# copy some of the samples in +COPY ./src/sp_local_ray.py local/ + +# copy test +COPY test/ test/ +COPY test-data/ test-data/ + +# Set environment +ENV PYTHONPATH /home/ray + +# Put these at the end since they seem to upset the docker cache. +ARG BUILD_DATE +ARG GIT_COMMIT +LABEL build-date=$BUILD_DATE +LABEL git-commit=$GIT_COMMIT diff --git a/transforms/code/semantic_profiler/ray/Makefile b/transforms/code/semantic_profiler/ray/Makefile new file mode 100644 index 000000000..c4ddf5f0a --- /dev/null +++ b/transforms/code/semantic_profiler/ray/Makefile @@ -0,0 +1,58 @@ +# Define the root of the local git clone for the common rules to be able +# know where they are running from. +REPOROOT=../../../.. +# Include a library of common .transform.* targets which most +# transforms should be able to reuse. However, feel free +# to override/redefine the rules below. + +include $(REPOROOT)/transforms/.make.transforms + +TRANSFORM_NAME=sp + +BASE_IMAGE=${RAY_BASE_IMAGE} +venv:: .transforms.ray-venv + +test:: .transforms.ray-test + +clean:: .transforms.clean + +image:: .transforms.ray-image + +test-src:: .transforms.test-src + +setup:: .transforms.setup + +test-image:: .transforms.ray-test-image + +build:: build-dist image + +publish: publish-image + +publish-image:: .transforms.publish-image-ray + +setup:: .transforms.setup + +# set the version of python transform that this depends on. +set-versions: + $(MAKE) TRANSFORM_PYTHON_VERSION=${SP_PYTHON_VERSION} TOML_VERSION=$(SP_RAY_VERSION) .transforms.set-versions + +build-dist:: .defaults.build-dist + +publish-dist:: .defaults.publish-dist + +# Ensure RUN_ARGS has a default value +RUN_ARGS ?= "" + +run-cli-sample: .transforms.run-cli-ray-sample + +run-local-sample: .transforms.run-local-ray-sample + +# run-s3-sample: .transforms.run-s3-ray-sample + +minio-start: .minio-start + +kind-load-image:: .transforms.kind-load-image + +docker-load-image: .defaults.docker-load-image + +docker-save-image: .defaults.docker-save-image diff --git a/transforms/code/semantic_profiler/ray/README.md b/transforms/code/semantic_profiler/ray/README.md new file mode 100644 index 000000000..a6a10810c --- /dev/null +++ b/transforms/code/semantic_profiler/ray/README.md @@ -0,0 +1,45 @@ +# SP Ray Transform +Please see the set of +[transform project conventions](../../../README.md#transform-project-conventions) +for details on general project conventions, transform configuration, +testing and IDE set up. + +## Summary +This project wraps the [sp transform](../python) with a Ray runtime. + +## Configuration and command line Options + +SP configuration and command line options are the same as for the [base python](../python) transform. + +## Running + +### Launched Command Line Options +In addition to those available to the transform as defined in [here](../python/README.md), +the set of +[ray launcher](../../../../data-processing-lib/doc/ray-launcher-options.md) are available. + +### Running the samples +To run the samples, use the following `make` targets + +* `run-cli-sample` - runs src/sp_transform.py using command line args +* `run-local-sample` - runs src/sp_local_ray.py + +These targets will activate the virtual environment and set up any configuration needed. 
+Use the `-n` option of `make` to see the detail of what is done to run the sample.
+
+For example,
+```shell
+make run-cli-sample
+...
+```
+Then
+```shell
+ls output
+```
+to see the results of the transform.
+
+### Transforming data using the transform image
+
+To use the transform image to transform your data, please refer to the
+[running images quickstart](../../../../doc/quick-start/run-transform-image.md),
+substituting the name of this transform image and runtime as appropriate.
diff --git a/transforms/code/semantic_profiler/ray/pyproject.toml b/transforms/code/semantic_profiler/ray/pyproject.toml
new file mode 100644
index 000000000..e175be027
--- /dev/null
+++ b/transforms/code/semantic_profiler/ray/pyproject.toml
@@ -0,0 +1,45 @@
+[project]
+name = "dpk_sp_transform_ray"
+version = "0.2.1.dev0"
+requires-python = ">=3.10"
+description = "SP Ray Transform"
+license = {text = "Apache-2.0"}
+readme = {file = "README.md", content-type = "text/markdown"}
+authors = [
+    { name = "Aishwariya Chakraborty", email = "aishwariya.chakraborty1@ibm.com" },
+]
+dependencies = [
+    "dpk-sp-transform-python==0.2.1.dev0",
+    "data-prep-toolkit-ray==0.2.1.dev0",
+]
+
+[build-system]
+requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"]
+build-backend = "setuptools.build_meta"
+
+[project.optional-dependencies]
+dev = [
+    "twine",
+    "pytest>=7.3.2",
+    "pytest-dotenv>=0.5.2",
+    "pytest-env>=1.0.0",
+    "pre-commit>=3.3.2",
+    "pytest-cov>=4.1.0",
+    "pytest-mock>=3.10.0",
+    "moto==5.0.5",
+    "markupsafe==2.0.1",
+]
+
+[options]
+package_dir = ["src","test"]
+
+[options.packages.find]
+where = ["src/"]
+
+[tool.pytest.ini_options]
+# Currently we use low coverage since we have to run tests separately (see makefile)
+#addopts = "--cov --cov-report term-missing --cov-fail-under 25"
+markers = ["unit: unit tests", "integration: integration tests"]
+
+[tool.coverage.run]
+include = ["src/*"]
diff --git a/transforms/code/semantic_profiler/ray/src/concept_map/updated_concept_list.csv b/transforms/code/semantic_profiler/ray/src/concept_map/updated_concept_list.csv
new file mode 100644
index 000000000..685d62d3d
--- /dev/null
+++ b/transforms/code/semantic_profiler/ray/src/concept_map/updated_concept_list.csv
@@ -0,0 +1,14 @@
+Category
+Algorithms and Data Structures
+Database Management
+File Handling
+Networking and Messaging
+Graphical User Interface Design
+Security
+Scheduling and Concurrency
+Logging and Monitoring
+Web Development
+Mathematics and Numerics
+Code Analysis and Linting
+Testing
+Data Serialization
\ No newline at end of file
diff --git a/transforms/code/semantic_profiler/ray/src/examples/examples-i.csv b/transforms/code/semantic_profiler/ray/src/examples/examples-i.csv
new file mode 100644
index 000000000..639735b66
--- /dev/null
+++ b/transforms/code/semantic_profiler/ray/src/examples/examples-i.csv
@@ -0,0 +1,27 @@
+Library,Language
+algorithms,Python
+asyncio,Python
+arrow,Python
+authlib,Python
+webassets,Python
+scipy,Python
+pymysql,Python
+mimetypes,Python
+logging,Python
+flake8,Python
+mamba,Python
+marshmallow,Python
+tkinter,Python
+com.leansoft.bigqueue,Java
+com.cisco.commons.networking,Java
+net.time4j,Java
+org.apache.shiro,Java
+java.net.http,Java
+org.apache.commons.math4,Java
+ch.vorburger.mariaDB4j,Java
+com.google.jimfs,Java
+java.logging,Java
+org.sonar,Java
+org.junit,Java
+com.cedarsoftware:json-io,Java
+java.desktop,Java
diff --git a/transforms/code/semantic_profiler/ray/src/examples/examples-o.csv
b/transforms/code/semantic_profiler/ray/src/examples/examples-o.csv new file mode 100644 index 000000000..b7eb9397a --- /dev/null +++ b/transforms/code/semantic_profiler/ray/src/examples/examples-o.csv @@ -0,0 +1,27 @@ +Library,Language,Category +algorithms,Python,Algorithms and Data Structures +asyncio,Python,Networking and Messaging +arrow,Python,Scheduling and Concurrency +authlib,Python,Security +webassets,Python,Web Development +scipy,Python,Mathematics and Numerics +pymysql,Python,Database Management +mimetypes,Python,File Handling +logging,Python,Logging and Monitoring +flake8,Python,Code Analysis and Linting +mamba,Python,Testing +marshmallow,Python,Data Serialization +tkinter,Python,Graphical User Interface Design +com.leansoft.bigqueue,Java,Algorithms and Data Structures +com.cisco.commons.networking,Java,Networking and Messaging +net.time4j,Java,Scheduling and Concurrency +org.apache.shiro,Java,Security +java.net.http,Java,Web Development +org.apache.commons.math4,Java,Mathematics and Numerics +ch.vorburger.mariaDB4j,Java,Database Management +com.google.jimfs,Java,File Handling +java.logging,Java,Logging and Monitoring +org.sonar,Java,Code Analysis and Linting +org.junit,Java,Testing +com.cedarsoftware:json-io,Java,Data Serialization +java.desktop,Java,Graphical User Interface Design \ No newline at end of file diff --git a/transforms/code/semantic_profiler/ray/src/ikb/extracted_data.csv b/transforms/code/semantic_profiler/ray/src/ikb/extracted_data.csv new file mode 100644 index 000000000..6fda787c8 --- /dev/null +++ b/transforms/code/semantic_profiler/ray/src/ikb/extracted_data.csv @@ -0,0 +1,39 @@ +dynamic_bitset,Cpp,Algorithms and Data Structures +tries,Cpp,Algorithms and Data Structures +algorithm,Cpp,Algorithms and Data Structures +uni-algo,Cpp,Algorithms and Data Structures +boost.asio,Cpp,Networking and Messaging +cpp-netlib,Cpp,Networking and Messaging +zmq,Cpp,Networking and Messaging +azmq,Cpp,Networking and Messaging +thread-pool,Cpp,Scheduling and Concurrency +chrono,Cpp,Scheduling and Concurrency +concurrencpp,Cpp,Scheduling and Concurrency +time,Cpp,Scheduling and Concurrency +libressl,Cpp,Security +"dynamic_bitset","Cpp","Algorithms and Data Structures" +"tries","Cpp","Algorithms and Data Structures" +"algorithm","Cpp","Algorithms and Data Structures" +"uni-algo","Cpp","Algorithms and Data Structures" +"boost.asio","Cpp","Networking and Messaging" +"cpp-netlib","Cpp","Networking and Messaging" +"zmq","Cpp","Networking and Messaging" +"azmq","Cpp","Networking and Messaging" +"thread-pool","Cpp","Scheduling and Concurrency" +"chrono","Cpp","Scheduling and Concurrency" +"concurrencpp","Cpp","Scheduling and Concurrency" +"time","Cpp","Scheduling and Concurrency" +"libressl","Cpp","Security" +"dynamic_bitset","Cpp","Algorithms and Data Structures" +"tries","Cpp","Algorithms and Data Structures" +"algorithm","Cpp","Algorithms and Data Structures" +"uni-algo","Cpp","Algorithms and Data Structures" +"boost.asio","Cpp","Networking and Messaging" +"cpp-netlib","Cpp","Networking and Messaging" +"zmq","Cpp","Networking and Messaging" +"azmq","Cpp","Networking and Messaging" +"thread-pool","Cpp","Scheduling and Concurrency" +"chrono","Cpp","Scheduling and Concurrency" +"concurrencpp","Cpp","Scheduling and Concurrency" +"time","Cpp","Scheduling and Concurrency" +"libressl","Cpp","Security" diff --git a/transforms/code/semantic_profiler/ray/src/ikb/ikb_model.csv b/transforms/code/semantic_profiler/ray/src/ikb/ikb_model.csv new file mode 100644 index 000000000..bda9d2a66 --- 
/dev/null +++ b/transforms/code/semantic_profiler/ray/src/ikb/ikb_model.csv @@ -0,0 +1,1021 @@ +Library,Language,Category +dynamic_bitset,Cpp,Algorithms and Data Structures +tries,Cpp,Algorithms and Data Structures +algorithm,Cpp,Algorithms and Data Structures +uni-algo,Cpp,Algorithms and Data Structures +boost.asio,Cpp,Networking and Messaging +cpp-netlib,Cpp,Networking and Messaging +zmq,Cpp,Networking and Messaging +azmq,Cpp,Networking and Messaging +thread-pool,Cpp,Scheduling and Concurrency +chrono,Cpp,Scheduling and Concurrency +concurrencpp,Cpp,Scheduling and Concurrency +time,Cpp,Scheduling and Concurrency +libressl,Cpp,Security +libgcrypt,Cpp,Security +nettle,Cpp,Security +digestpp,Cpp,Security +libonion,Cpp,Web Development +cpp-httplib,Cpp,Web Development +jwt-cpp,Cpp,Security +libfv,Cpp,Mathematics and Numerics +blaze,Cpp,Mathematics and Numerics +cnl,Cpp,Mathematics and Numerics +eigen,Cpp,Mathematics and Numerics +linalg,Cpp,Mathematics and Numerics +clickhouse,Cpp,Database Management +leveldb,Cpp,Database Management +libpqxx,Cpp,Database Management +sqlite,Cpp,Database Management +filesystem,Cpp,File Handling +llfio,Cpp,File Handling +glob,Cpp,File Handling +tinydir,Cpp,File Handling +spdlog,Cpp,Logging and Monitoring +boost.log,Cpp,Logging and Monitoring +glog,Cpp,Logging and Monitoring +reckless,Cpp,Algorithms and Data Structures +clang-tidy,Cpp,Code Analysis and Linting +clangd,Cpp,Code Analysis and Linting +cquery,Cpp,Code Analysis and Linting +cppcheck,Cpp,Code Analysis and Linting +boost.test,Cpp,Testing +benchmark,Cpp,Testing +cpputest,Cpp,Testing +ctest,Cpp,Testing +dlib,Cpp,Algorithms and Data Structures +blitz,Cpp,Algorithms and Data Structures +armadillo,Cpp,Algorithms and Data Structures +oneapi/dal,Cpp,Database Management +frozen,Cpp,Data Serialization +glaze,Cpp,Data Serialization +cppcodec,Cpp,Data Serialization +boost.serialization,Cpp,Data Serialization +infra,Cpp,Networking and Messaging +workflow,Cpp,Scheduling and Concurrency +taskflow,Cpp,Scheduling and Concurrency +libthrift,Cpp,Networking and Messaging +cegui,Cpp,Graphical User Interface Design +wxwidgets,Cpp,Graphical User Interface Design +gtk,Cpp,Graphical User Interface Design +nanogui,Cpp,Graphical User Interface Design +com.leansoft.bigqueue,Java,Algorithms and Data Structures +com.liveramp.hyperminhash,Java,Algorithms and Data Structures +org.pcollections,Java,Algorithms and Data Structures +org.ojalgo,Java,Algorithms and Data Structures +com.cisco.commons.networking,Java,Networking and Messaging +io.netty,Java,Networking and Messaging +org.apache.kafka,Java,Networking and Messaging +com.rabbitmq,Java,Networking and Messaging +net.time4j,Java,Scheduling and Concurrency +org.jobrunr:jobrunr,Java,Scheduling and Concurrency +org.quartz,Java,Scheduling and Concurrency +org.knowm.sundial,Java,Scheduling and Concurrency +org.apache.shiro,Java,Security +org.bouncycastle,Java,Security +jdk.crypto.cryptoki,Java,Security +jdk.security,Java,Security +java.net.http,Java,Web Development +jdk.httpserver,Java,Web Development +io.activej.codegen,Java,Code Analysis and Linting +ninja,Java,Code Analysis and Linting +org.apache.commons.math4,Java,Mathematics and Numerics +org.apache.commons.numbers,Java,Mathematics and Numerics +org.apache.commons.rng,Java,Mathematics and Numerics +com.mathLibrary,Java,Mathematics and Numerics +ch.vorburger.mariaDB4j,Java,Database Management +java.sql,Java,Database Management +redis.clients.jedis,Java,Database Management +org.jooq,Java,Database Management 
+com.google.jimfs,Java,File Handling +java.io,Java,File Handling +java.nio.file,Java,File Handling +org.apache.commons.vfs2,Java,File Handling +java.logging,Java,Logging and Monitoring +jdk.jconsole,Java,IT Automation +java.util.logging,Java,Logging and Monitoring +org.slf4j.Logger,Java,Logging and Monitoring +org.sonar,Java,Code Analysis and Linting +fr.inria.gforge.spoon,Java,Code Analysis and Linting +com.puppycrawl.tools.checkstyle,Java,Code Analysis and Linting +net.sourceforge.pmd,Java,Code Analysis and Linting +org.junit,Java,Testing +com.intuit.karate,Java,Testing +org.mockito,Java,Testing +org.apache.jmeter,Java,Testing +org.influxdb,Java,Data Analysis +org.apache.spark,Java,Data Analysis +org.apache.flink,Java,Data Analysis +weka,Java,Data Analysis +com.cedarsoftware:json-io,Java,Data Serialization +com.google.flatbuffers,Java,Data Serialization +org.msgpack,Java,Data Serialization +com.esotericsoftware.kryo,Java,Data Serialization +jenkins.model.Jenkins,Java,IT Automation +org.apache.maven,Java,IT Automation +org.gradle,Java,IT Automation +com.microsoft.terraform,Java,IT Automation +java.desktop,Java,Graphical User Interface Design +java.awt,Java,Graphical User Interface Design +org.openjfx,Java,Graphical User Interface Design +org.eclipse.swt,Java,Graphical User Interface Design +ngraph.graph,JavaScript,Algorithms and Data Structures +buckets,JavaScript,Algorithms and Data Structures +mori,JavaScript,Algorithms and Data Structures +graphlib,JavaScript,Algorithms and Data Structures +socket.io,JavaScript,Networking and Messaging +request,JavaScript,Web Development +amqplib,JavaScript,Networking and Messaging +mqtt,JavaScript,Networking and Messaging +fullcalendar,JavaScript,Graphical User Interface Design +later,JavaScript,Scheduling and Concurrency +date-fns,JavaScript,Mathematics and Numerics +Moment,JavaScript,Mathematics and Numerics +helmet,JavaScript,Security +bcrypt,JavaScript,Security +js-xss,JavaScript,Security +xss-filters,JavaScript,Security +vue,JavaScript,Graphical User Interface Design +react,JavaScript,Graphical User Interface Design +express,JavaScript,Web Development +angular,JavaScript,Graphical User Interface Design +Polynomial,JavaScript,Mathematics and Numerics +Numeral-js,JavaScript,Mathematics and Numerics +accounting,JavaScript,Mathematics and Numerics +odometer,JavaScript,Mathematics and Numerics +datavore,JavaScript,Data Analysis +DB,JavaScript,Database Management +sql,JavaScript,Database Management +NeDB,JavaScript,Database Management +jStorage,JavaScript,Database Management +store,JavaScript,Database Management +cross-storage,JavaScript,File Handling +localForage,JavaScript,File Handling +console.log-wrapper,JavaScript,Logging and Monitoring +storybook,JavaScript,Graphical User Interface Design +minilog,JavaScript,Logging and Monitoring +loglevel,JavaScript,Logging and Monitoring +eslint,JavaScript,Code Analysis and Linting +jshint,JavaScript,Code Analysis and Linting +tslint,JavaScript,Code Analysis and Linting +sonarqube,JavaScript,Code Analysis and Linting +jest,JavaScript,Testing +Cypress,JavaScript,Testing +jasmine,JavaScript,Testing +qunit,JavaScript,Testing +fabric,JavaScript,Web Development +d3,JavaScript,Graphical User Interface Design +three,JavaScript,Graphical User Interface Design +sigma,JavaScript,Graphical User Interface Design +tempo,JavaScript,Graphical User Interface Design +jsfmt,JavaScript,Data Serialization +fecha,JavaScript,Data Serialization +protobufjs,JavaScript,Data Serialization +shelljs,JavaScript,IT Automation 
+forever,JavaScript,Scheduling and Concurrency +node-cron,JavaScript,Scheduling and Concurrency +jenkins,JavaScript,IT Automation +react,JavaScript,Web Development +vue,JavaScript,Web Development +electron,JavaScript,Web Development +angular,JavaScript,Web Development +stdgpu,C,Algorithms and Data Structures +urdfdom,C,Algorithms and Data Structures +cxxgraph,C,Algorithms and Data Structures +metis,C,Algorithms and Data Structures +nanomsg,C,Networking and Messaging +curl,C,Web Development +librabbitmq,C,Networking and Messaging +mosquitto,C,Networking and Messaging +uv,C,Scheduling and Concurrency +time,C,Scheduling and Concurrency +pth,C,Scheduling and Concurrency +pthread,C,Scheduling and Concurrency +OpenSSL,C,Security +GnuTLS,C,Security +libsodium,C,Security +libgcrypt,C,Security +facil.io,C,File Handling +kcgi,C,Web Development +KLone,C,Web Development +civetweb,C,Web Development +apophenia,C,Data Analysis +cmathl,C,Mathematics and Numerics +GSL,C,Mathematics and Numerics +SLEPc,C,Mathematics and Numerics +DuckDB,C,Database Management +MySQL,C,Database Management +sophia,C,Database Management +SQLite,C,Database Management +stdio,C,File Handling +POSIX,C,IT Automation +HDF5,C,File Handling +fstream,C,File Handling +syslog,C,Logging and Monitoring +spdlog,C,Logging and Monitoring +collectd,C,Data Analysis +nagios-plugins,C,IT Automation +libclang,C,Code Analysis and Linting +Cppcheck,C,Code Analysis and Linting +libclang-tidy,C,Code Analysis and Linting +Infer,C,Code Analysis and Linting +CMocka,C,Testing +MinUnit,C,Testing +Valgrind,C,Testing +Check,C,Testing +gsl-lite,C,Mathematics and Numerics +libcsv,C,Data Analysis +dataframe,C,Data Analysis +iqa,C,Data Analysis +libyaml,C,Data Serialization +libfmt,C,Data Serialization +flatbuffers,C,Data Serialization +msgpack-c,C,Data Serialization +nix_api_util,C,IT Automation +libcircmetrics,C,Logging and Monitoring +etcd-api,C,Networking and Messaging +cetcd,C,Networking and Messaging +microui,C,Graphical User Interface Design +tinyfiledialogs,C,Graphical User Interface Design +luigi ,C,IT Automation +GTK,C,Graphical User Interface Design +Akade.IndexedSet,C#,Algorithms and Data Structures +Akka.DistributedData,C#,Algorithms and Data Structures +dotnet-mgcb-compute,C#,Mathematics and Numerics +QuantConnect.Algorithm.CSharp,C#,Algorithms and Data Structures +Microsoft.AspNetCore.Connections,C#,Networking and Messaging +System.Net.Http.WinHttpHandler,C#,Web Development +Microsoft.AspNetCore.WebUtilities,C#,Web Development +MessagePipe,C#,Networking and Messaging +Microsoft.SemanticKernel.Plugins.MsGraph,C#,Algorithms and Data Structures +System.Threading.Tasks,C#,Scheduling and Concurrency +Hangfire,C#,Scheduling and Concurrency +OrchardCore.PublishLater,C#,Scheduling and Concurrency +CefSharp.WinForm.Net.Core,C#,Graphical User Interface Design +System.DirectoryServices.AccountManagement,C#,IT Automation +System.Security.Permissions,C#,Security +System.Security.AccessControl,C#,Security +@pavelsavara/dotnet-runtime,C#,IT Automation +@abp/ng.oauth,C#,Security +@abp/core,C#,Web Development +@abp/ng.components,C#,Web Development +SharpDX.Mathematics,C#,Mathematics and Numerics +AvaloniaMath,C#,Mathematics and Numerics +WpfMath,C#,Mathematics and Numerics +NCalcSync,C#,Mathematics and Numerics +microsoft.entityframeworkcore.tools,C#,Database Management +Dapper,C#,Database Management +Microsoft.Azure.Management.PostgreSQL,C#,Database Management +Microsoft.Azure.Management.CosmosDB,C#,Database Management +Reloaded.Mod.Loader.IO,C#,File Handling 
+DICOMcloud,C#,Data Analysis +Aurio,C#,Graphical User Interface Design +SeekableS3Stream,C#,File Handling +Microsoft.Extensions.Logging,C#,Logging and Monitoring +Microsoft.Azure.Management.Profiles.hybrid_2019_03_01.Monitor,C#,IT Automation +Azure.Monitor.OpenTelemetry.AspNetCore,C#,Logging and Monitoring +Microsoft.AspNetCore.Identity,C#,Security +roslyn,C#,Code Analysis and Linting +Microsoft.Toolkit.Uwp.PlatformSpecificAnalyzer,C#,Code Analysis and Linting +Uno.Microsoft.Toolkit.Uwp.PlatformSpecificAnalyzer,C#,Code Analysis and Linting +Microsoft.CST.ApplicationInspector.Common,C#,Code Analysis and Linting +Microsoft.AspNetCore.TestHost,C#,Testing +Microsoft.AspNetCore.Mvc.Testing,C#,Testing +Microsoft.AspNetCore.SignalR.Specification.Tests,C#,Testing +KIF,C#,Algorithms and Data Structures +Microsoft.Data.Analysis,C#,Data Analysis +Azure.Media.VideoAnalyzer.Edge,C#,Data Analysis +Google.Cloud.Trace.V1,C#,Logging and Monitoring +ClosedXML.Report,C#,Data Serialization +System.Formats,C#,Data Serialization +System.IO.Ports,C#,File Handling +System.Text.Json,C#,Data Serialization +App.Metrics.Formatters.Graphite,C#,Logging and Monitoring +Microsoft.Crank.AzureDevOpsWorker,C#,IT Automation +AWSSDK.DevOpsGuru,C#,IT Automation +Microsoft.SourceLink.AzureDevOpsServer.Git,C#,IT Automation +Saritasa.Tools.Messages.TestRuns,C#,Testing +SSRD.IdentityUI,C#,Security +bashforms,C#,Graphical User Interface Design +NSCI,C#,Algorithms and Data Structures +WSCT.GUI,C#,Graphical User Interface Design +lock-free,D,Algorithms and Data Structures +liblfdsd,D,Algorithms and Data Structures +bitranged,D,Algorithms and Data Structures +dstruct,D,Algorithms and Data Structures +vibe-d,D,Web Development +hunt-net,D,Networking and Messaging +nbuff,D,Algorithms and Data Structures +collie,D,Algorithms and Data Structures +photon,D,Algorithms and Data Structures +scheduled,D,Scheduling and Concurrency +meta,D,Code Analysis and Linting +ctini,D,Security +hunt-security,D,Security +hunt-shiro,D,Security +secured,D,Security +csprng,D,Security +pgator-backend,D,Web Development +hunt-cache,D,Data Analysis +formoshlep,D,Data Analysis +web-config,D,Web Development +simple-math,D,Mathematics and Numerics +evalex,D,Mathematics and Numerics +dualnumbers,D,Mathematics and Numerics +tau,D,Mathematics and Numerics +mysql-native,D,Database Management +derelict-pq,D,Database Management +ddbc,D,Database Management +dpq2,D,Database Management +inifiled,D,File Handling +fswatch,D,File Handling +tinyfiledialogs,D,Graphical User Interface Design +thepath,D,File Handling +hunt,D,Testing +gogga,D,Data Analysis +dlog,D,Logging and Monitoring +colorlog,D,Logging and Monitoring +code_checker,D,Code Analysis and Linting +dfmt,D,Data Serialization +dscanner,D,Code Analysis and Linting +dparse,D,Algorithms and Data Structures +silly,D,Algorithms and Data Structures +unit-threaded,D,Testing +fluent-asserts,D,Testing +dests,D,Algorithms and Data Structures +magpie,D,Algorithms and Data Structures +dvec,D,Mathematics and Numerics +d-tree,D,Algorithms and Data Structures +d_dataframes,D,Data Analysis +jsonizer,D,Data Serialization +mir-ion,D,Algorithms and Data Structures +protobuf,D,Data Serialization +siryul,D,Security +iup,D,Graphical User Interface Design +declui,D,Graphical User Interface Design +d_imgui,D,Graphical User Interface Design +dlangui,D,Graphical User Interface Design +libgit2,D,Database Management +yamkeys,D,Security +lua-jit-d,D,IT Automation +led,D,Graphical User Interface Design +array-tool,Rust,Algorithms and Data Structures 
+petgraph,Rust,Algorithms and Data Structures +heapless,Rust,Algorithms and Data Structures +argon2,Rust,Security +mio,Rust,Networking and Messaging +actix-rt,Rust,Scheduling and Concurrency +socket2,Rust,Networking and Messaging +crossbeam-channel,Rust,Networking and Messaging +cron,Rust,Scheduling and Concurrency +crossbeam-deque,Rust,Algorithms and Data Structures +smolscale,Rust,Data Analysis +job_scheduler,Rust,Scheduling and Concurrency +zeroize,Rust,Security +rocket,Rust,Web Development +rpassword,Rust,Security +trust-dns-resolver,Rust,Networking and Messaging +@farmfe/core,Rust,IT Automation +wasmer-clif-fork-frontend,Rust,Web Development +seed,Rust,Graphical User Interface Design +@farmfe/cli,Rust,IT Automation +num-traits,Rust,Mathematics and Numerics +num,Rust,Mathematics and Numerics +num-bigint,Rust,Mathematics and Numerics +cgmath,Rust,Mathematics and Numerics +rusqlite,Rust,Database Management +redis,Rust,Database Management +diesel,Rust,Database Management +postgres,Rust,Database Management +fs_extra,Rust,File Handling +toml,Rust,Data Serialization +tempfile,Rust,File Handling +zip,Rust,File Handling +log,Rust,Logging and Monitoring +env_logger,Rust,Logging and Monitoring +tracing,Rust,Logging and Monitoring +slog,Rust,Logging and Monitoring +@cubejs-backend/linter,Rust,Code Analysis and Linting +selene-lib,Rust,Data Analysis +ast-grep,Rust,Code Analysis and Linting +cargo-crev,Rust,Code Analysis and Linting +assert_cmd,Rust,Testing +quickcheck,Rust,Testing +proptest,Rust,Testing +wasm-bindgen-test,Rust,Testing +rls-analysis,Rust,Code Analysis and Linting +rstats,Rust,Data Analysis +amadeus-commoncrawl,Rust,Data Analysis +opendp,Rust,Data Analysis +serde,Rust,Data Serialization +serde_json,Rust,Data Serialization +serde_yaml,Rust,Data Serialization +bincode,Rust,Data Serialization +lsio,Rust,File Handling +shuttle-runtime,Rust,IT Automation +rustc_data_structures,Rust,Algorithms and Data Structures +compiler_base_span,Rust,Algorithms and Data Structures +slint,Rust,Algorithms and Data Structures +qinpel-wiz,Rust,Algorithms and Data Structures +arc,Rust,Algorithms and Data Structures +cushy,Rust,Algorithms and Data Structures +tumblr/XExtensionItem,Objective-C,Algorithms and Data Structures +TBQuadTree,Objective-C,Algorithms and Data Structures +POSDataStructures,Objective-C,Algorithms and Data Structures +PESGraph,Objective-C,Algorithms and Data Structures +AFNetworking,Objective-C,Networking and Messaging +CocoaAsyncSocket,Objective-C,Networking and Messaging +Atlas,Objective-C,Graphical User Interface Design +RestKit,Objective-C,Web Development +SZServerTimeManager,Objective-C,Scheduling and Concurrency +CalendarLib,Objective-C,Scheduling and Concurrency +Selene,Objective-C,Security +ZMJGanttChart,Objective-C,Graphical User Interface Design +AWSCognitoIdentityProviderASF,Objective-C,Security +gObfuscator,Objective-C,Security +Lockbox,Objective-C,Security +STPrivilegedTask,Objective-C,IT Automation +vtx,Objective-C,Algorithms and Data Structures +ColendiWebViewSDK,Objective-C,Web Development +@abp/bootstrap-daterangepicker,Objective-C,Web Development +@abp/ng.oauth,Objective-C,Security +vMAT,Objective-C,Mathematics and Numerics +crlibm,Objective-C,Mathematics and Numerics +MCKNumerics,Objective-C,Mathematics and Numerics +ACMatrix,Objective-C,Mathematics and Numerics +DKDBManager,Objective-C,Database Management +FlexileDatabase,Objective-C,Database Management +KKDSqlite,Objective-C,Database Management +SNDBManager,Objective-C,Database Management 
+APSmartStorage,Objective-C,File Handling +zipzap,Objective-C,File Handling +AliyunOSSiOS,Objective-C,File Handling +YTKKeyValueStore,Objective-C,Data Serialization +github.com/github.com/CocoaLumberjack/CocoaLumberjack,Objective-C,Logging and Monitoring +VENVersionTracker,Objective-C,IT Automation +NSLogger,Objective-C,Logging and Monitoring +NetworkEye,Objective-C,Networking and Messaging +nq-test-react-native-maps,Objective-C,Graphical User Interface Design +KIF,Objective-C,Testing +facebookarchive/xctool,Objective-C,Code Analysis and Linting +xctool,Objective-C,Code Analysis and Linting +KRGreyTheory,Objective-C,Mathematics and Numerics +DataGrinch,Objective-C,Data Analysis +XsdaKit,Objective-C,Data Serialization +cordova-pgyer-dandelion,Objective-C,Web Development +sbjson,Objective-C,Data Serialization +FXParser,Objective-C,Data Analysis +CSV,Objective-C,Data Analysis +NSMutableData+MultipartFormData,Objective-C,File Handling +Masonry,Objective-C,Graphical User Interface Design +Chameleon,Objective-C,Graphical User Interface Design +Nimbus,Objective-C,Graphical User Interface Design +GPUImage,Objective-C,Graphical User Interface Design +infer,Objective-C,Code Analysis and Linting +OCLint,Objective-C,Code Analysis and Linting +sonatype,Objective-C,IT Automation +sigrid,Objective-C,IT Automation +fastlane,Objective-C,IT Automation +hammerspoon,Objective-C,Graphical User Interface Design +punic,Objective-C,IT Automation +jenkins-mobile-pipeline-shared-libraries,Objective-C,IT Automation +brotli,Ocaml,Data Compression +dtoa,Ocaml,Algorithms and Data Structures +bin_tree,Ocaml,Algorithms and Data Structures +base_trie,Ocaml,Algorithms and Data Structures +apero-net,Ocaml,Networking and Messaging +conduit,Ocaml,Networking and Messaging +netamqp,Ocaml,Networking and Messaging +posix-mqueue,Ocaml,File Handling +bap-primus-exploring-scheduler,Ocaml,Scheduling and Concurrency +builder,Ocaml,IT Automation +daypack-lib,Ocaml,Data Analysis +riot,Ocaml,Web Development +tls,Ocaml,Security +osx-acl,Ocaml,Security +content_security_policy,Ocaml,Security +aws-sts,Ocaml,Security +async_websocket,Ocaml,Web Development +benchpress-server,Ocaml,Web Development +builder-web,Ocaml,Web Development +cduce_ws,Ocaml,Web Development +posix-math,Ocaml,Mathematics and Numerics +smol,Ocaml,Data Serialization +crlibm,Ocaml,Mathematics and Numerics +lem,Ocaml,Code Analysis and Linting +caqti,Ocaml,Database Management +dbforge,Ocaml,Database Management +irmin,Ocaml,Database Management +links-mysql,Ocaml,Database Management +bitlib,Ocaml,Algorithms and Data Structures +chamelon,Ocaml,Web Development +fpath,Ocaml,File Handling +fileutils,Ocaml,File Handling +bolt,Ocaml,Algorithms and Data Structures +dolog,Ocaml,Logging and Monitoring +easy_logging,Ocaml,Logging and Monitoring +loga,Ocaml,Logging and Monitoring +bisect_ppx,Ocaml,Code Analysis and Linting +calli,Ocaml,Algorithms and Data Structures +clangml-transforms,Ocaml,Algorithms and Data Structures +dolmen_bin,Ocaml,Algorithms and Data Structures +base_quickcheck,Ocaml,Testing +caravan,Ocaml,Web Development +kaputt,Ocaml,Algorithms and Data Structures +ounit2,Ocaml,Testing +conformist,Ocaml,Code Analysis and Linting +dataframe,Ocaml,Data Analysis +dsfo,Ocaml,Data Analysis +llama_midi,Ocaml,Graphical User Interface Design +atdgen,Ocaml,Code Analysis and Linting +bitpack_serializer,Ocaml,Data Serialization +coq-serapi,Ocaml,Algorithms and Data Structures +grpc,Ocaml,Networking and Messaging +bap-build,Ocaml,IT Automation +argsh,Ocaml,IT Automation 
+conf-automake,Ocaml,IT Automation +dtools,Ocaml,IT Automation +bogue,Ocaml,Algorithms and Data Structures +unison-gui,Ocaml,Graphical User Interface Design +imguiml,Ocaml,Graphical User Interface Design +altgr-ergo,Ocaml,Algorithms and Data Structures +bk-tree,Haskell,Algorithms and Data Structures +algebraic-graphs,Haskell,Algorithms and Data Structures +recursion-schemes,Haskell,Algorithms and Data Structures +AvlTree,Haskell,Algorithms and Data Structures +grenade,Haskell,Security +network-conduit,Haskell,Networking and Messaging +streamly,Haskell,Algorithms and Data Structures +hedgehog,Haskell,Testing +haxl,Haskell,Web Development +amazonka-scheduler,Haskell,Scheduling and Concurrency +massiv-scheduler,Haskell,Scheduling and Concurrency +gogol-datafusion,Haskell,Data Analysis +tamarin-prover-theory,Haskell,Mathematics and Numerics +tamarin-prover,Haskell,Mathematics and Numerics +yst,Haskell,Data Analysis +fireward,Haskell,Security +snap-core,Haskell,Web Development +snap-server,Haskell,Web Development +gogol-pagespeed,Haskell,Web Development +gogol-indexing,Haskell,Data Analysis +pandoc,Haskell,Data Serialization +Agda,Haskell,Mathematics and Numerics +math-functions,Haskell,Mathematics and Numerics +commodities,Haskell,Data Analysis +gogol-spanner,Haskell,Database Management +gogol-sqladmin,Haskell,Database Management +gogol-datastore,Haskell,Database Management +dbmigrations,Haskell,Database Management +bytestring,Haskell,File Handling +io-streams,Haskell,File Handling +regions,Haskell,Algorithms and Data Structures +amazonka-kinesis-video-webrtc-storage,Haskell,Data Analysis +tensorflow-logging,Haskell,Logging and Monitoring +wai-extra,Haskell,Web Development +co-log,Haskell,Logging and Monitoring +gogol-cloudmonitoring,Haskell,IT Automation +pandoc,Haskell,Data Serialization +cassava,Haskell,Data Analysis +commonmark,Haskell,Data Serialization +auto,Haskell,Code Analysis and Linting +amazonka-devops-guru,Haskell,IT Automation +deptrack-devops,Haskell,IT Automation +gogol-testing,Haskell,Testing +LogicGrowsOnTrees,Haskell,Algorithms and Data Structures +gogol-datafusion,Haskell,Data Analysis +vty-ui,Haskell,Graphical User Interface Design +YampaSynth,Haskell,Algorithms and Data Structures +master-plan,Haskell,IT Automation +stan,Haskell,Data Analysis +hlint,Haskell,Code Analysis and Linting +liquidhaskell,Haskell,Code Analysis and Linting +ghc,Haskell,IT Automation +purescript,Haskell,Code Analysis and Linting +ghcide-test-utils,Haskell,Testing +hls-test-utils,Haskell,Testing +yesod-test,Haskell,Testing +statistics,Haskell,Mathematics and Numerics +statistics-skinny,Haskell,Mathematics and Numerics +ajhc,Haskell,Code Analysis and Linting +fortran-src,Haskell,Algorithms and Data Structures +BitVector,Nim,Algorithms and Data Structures +rbtree,Nim,Algorithms and Data Structures +binaryheap,Nim,Algorithms and Data Structures +algorithm,Nim,Algorithms and Data Structures +nativesockets,Nim,Networking and Messaging +net,Nim,Networking and Messaging +nimrdkafka,Nim,Networking and Messaging +mqtt,Nim,Networking and Messaging +monotimes,Nim,Scheduling and Concurrency +times,Nim,Scheduling and Concurrency +osproc,Nim,IT Automation +schedules,Nim,Scheduling and Concurrency +nimcrypt,Nim,Security +seccomp,Nim,Security +nimpass,Nim,Security +quickcrypt,Nim,Security +nerve,Nim,Networking and Messaging +palladian,Nim,Web Development +staticserver,Nim,Web Development +phoon,Nim,Web Development +seqmath,Nim,Mathematics and Numerics +extmath,Nim,Mathematics and Numerics 
+geometrymath,Nim,Mathematics and Numerics +neo,Nim,Database Management +niledb,Nim,Database Management +couchdb,Nim,Database Management +zfdbms,Nim,Database Management +pdba,Nim,Database Management +osfiles,Nim,File Handling +fileinput,Nim,File Handling +filetype,Nim,File Handling +stor,Nim,File Handling +octolog,Nim,Logging and Monitoring +morelogging,Nim,Logging and Monitoring +promexplorer,Nim,Data Analysis +metrics,Nim,Data Analysis +nimfmt,Nim,Code Analysis and Linting +coco,Nim,Code Analysis and Linting +treesitter,Nim,Code Analysis and Linting +nimalyzer,Nim,Code Analysis and Linting +testify,Nim,Testing +nimtest,Nim,Testing +testutils,Nim,Testing +halonium,Nim,Networking and Messaging +nimdata,Nim,Data Analysis +datamancer,Nim,Data Analysis +nimdataframe,Nim,Data Analysis +mpfit,Nim,Mathematics and Numerics +tomlserialization,Nim,Data Serialization +protobufserialization,Nim,Data Serialization +bson,Nim,Data Serialization +eminim,Nim,Algorithms and Data Structures +autome,Nim,IT Automation +monit,Nim,Logging and Monitoring +autonim,Nim,IT Automation +nake,Nim,IT Automation +nimblegui,Nim,Graphical User Interface Design +nigui,Nim,Graphical User Interface Design +sigui,Nim,Graphical User Interface Design +rdgui,Nim,Graphical User Interface Design +de.sciss:fingertree_2.11,Scala,Algorithms and Data Structures +org.scalameta:semanticdb-scalac-core_2.11.12,Scala,Code Analysis and Linting +org.axle-lang:axle-algorithms_2.11,Scala,Algorithms and Data Structures +de.sciss:strugatzki_2.10,Scala,Algorithms and Data Structures +org.apache.spark:spark-network-common_2.11,Scala,Networking and Messaging +com.github.molecule-labs:molecule-net_2.9.3,Scala,Networking and Messaging +org.elasticmq,Scala,Database Management +com.typesafe.akka:akka-stream_2.12,Scala,Networking and Messaging +com.miguno.akka:akka-mock-scheduler_2.11,Scala,Scheduling and Concurrency +com.enragedginger:akka-quartz-scheduler_2.11,Scala,Scheduling and Concurrency +edu.gemini:lucuma-typed-scheduler_sjs1_3,Scala,Scheduling and Concurrency +io.getkyo:kyo-scheduler_2.13,Scala,Scheduling and Concurrency +dev.zio:zio-json_3,Scala,Data Serialization +dev.zio:zio-json_2.12,Scala,Data Serialization +recheck,Scala,Code Analysis and Linting +org.beangle.security:beangle-security-core,Scala,Security +com.softwaremill.sttp:async-http-client-backend-future_2.12,Scala,Web Development +com.softwaremill.sttp:akka-http-backend_2.12,Scala,Web Development +com.eed3si9n:gigahorse-okhttp_2.12,Scala,Web Development +com.softwaremill.sttp.client3:slf4j-backend_2.12,Scala,Logging and Monitoring +com.github.vagmcs:optimus_2.11,Scala,Mathematics and Numerics +com.github.vagmcs:optimus-solver-oj_2.11,Scala,Mathematics and Numerics +io.github.scalamath:vecmatlib,Scala,Mathematics and Numerics +io.github.scalamath:cmplxlib,Scala,Mathematics and Numerics +com.typesafe.slick:slick_2.11,Scala,Database Management +org.tpolecat:doobie-core_2.12,Scala,Database Management +org.reactivemongo:reactivemongo_2.11,Scala,Database Management +org.tpolecat:doobie-postgres_2.12,Scala,Database Management +org.specs2:specs2_2.11,Scala,Testing +com.github.pathikrit:better-files_2.12,Scala,File Handling +com.github.scala-incubator.io:scala-io-file_2.10,Scala,File Handling +de.sciss:audiofile_2.11,Scala,Data Analysis +com.typesafe.scala-logging:scala-logging_2.12,Scala,Logging and Monitoring +com.typesafe.scala-logging:scala-logging-slf4j_2.11,Scala,Logging and Monitoring +org.clapper:grizzled-slf4j_2.11,Scala,Logging and Monitoring +com.outr:scribe_2.12,Scala,Data 
Serialization +org.psywerx.hairyfotr.linter,Scala,Code Analysis and Linting +scala.meta.parsers,Scala,Algorithms and Data Structures +org.scalastyle,Scala,Code Analysis and Linting +com.sksamuel.scapegoat,Scala,Code Analysis and Linting +org.scala-js:scalajs-test-bridge_2.13,Scala,Testing +org.scala-js:scalajs-test-interface_2.12,Scala,Testing +com.typesafe.play:play-test_2.11,Scala,Testing +org.scalatest:scalatest_2.9.1,Scala,Testing +org.finra.megasparkdiff:mega-spark-diff,Scala,Data Analysis +com.github.vicpara:exploratory-data-analysis_2.10,Scala,Data Analysis +org.emmalanguage:emma,Scala,Data Analysis +org.emmalanguage:emma-benchmarks,Scala,Data Analysis +org.simplex3d:simplex3d-data-format_2.10,Scala,Data Serialization +org.wvlet.airframe:airframe-tablet_2.13.0-RC2,Scala,Data Serialization +org.gnieh:fs2-data-text_2.13,Scala,Data Serialization +com.fasterxml.jackson.module:jackson-module-scala_2.12,Scala,Data Serialization +tech.orkestra:orkestra-core_sjs0.6_2.12,Scala,IT Automation +com.goyeau:orchestra-cron_2.12,Scala,Scheduling and Concurrency +com.aamend.spark:archetype,Scala,IT Automation +io.kevinlee:sbt-devoops-github-core_2.12_1.0,Scala,IT Automation +de.sciss:dotterweide-ui_2.11,Scala,Graphical User Interface Design +org.scala-lang.modules.scala-swing,Scala,Graphical User Interface Design +io.github.kacperfkorban.guinep-web,Scala,Web Development +io.github.mimoguz.layeredfonticon-core,Scala,Graphical User Interface Design +piecemeal,Dart,Algorithms and Data Structures +collection,Dart,Algorithms and Data Structures +pointycastle,Dart,Security +graphs,Dart,Algorithms and Data Structures +connectivity_plus,Dart,Networking and Messaging +cached_network_image,Dart,File Handling +connectivity,Dart,Networking and Messaging +firebase_messaging,Dart,Networking and Messaging +reflutter,Dart,Web Development +server_universe,Dart,Web Development +create-fullstack-app-cli,Dart,IT Automation +angel_graphql,Dart,Web Development +flutter_local_notifications,Dart,Graphical User Interface Design +cron,Dart,Scheduling and Concurrency +timer_builder,Dart,Scheduling and Concurrency +syncfusion_flutter_calendar,Dart,Graphical User Interface Design +google_sign_in,Dart,Security +mqtt_client,Dart,Networking and Messaging +angel_security,Dart,Security +envied,Dart,Code Analysis and Linting +math_expressions,Dart,Mathematics and Numerics +more,Dart,Algorithms and Data Structures +ml_linalg,Dart,Mathematics and Numerics +fixed,Dart,Algorithms and Data Structures +sqflite,Dart,Database Management +cloud_firestore,Dart,Database Management +postgres,Dart,Database Management +hive,Dart,Database Management +path_provider,Dart,File Handling +image,Dart,Graphical User Interface Design +glob,Dart,File Handling +file,Dart,File Handling +logging,Dart,Logging and Monitoring +logger,Dart,Logging and Monitoring +ansicolor,Dart,Logging and Monitoring +pretty_dio_logger,Dart,Logging and Monitoring +flutter_lints,Dart,Code Analysis and Linting +pedantic_mono,Dart,Code Analysis and Linting +carapacik_lints,Dart,Code Analysis and Linting +velvet_custom_lints,Dart,Code Analysis and Linting +test,Dart,Testing +unittest,Dart,Testing +build_test,Dart,Testing +mocktail,Dart,Testing +grizzly_array,Dart,Algorithms and Data Structures +flutter_insights,Dart,Data Analysis +packhorse,Dart,IT Automation +plugin_mappintelligence,Dart,IT Automation +yaml,Dart,Data Serialization +http_parser,Dart,Web Development +built_value,Dart,Data Serialization +bson,Dart,Data Serialization +unleash,Dart,IT Automation +docrunner,Dart,IT 
Automation +cobertura,Dart,Code Analysis and Linting +bitwarden_secrets,Dart,Security +magical_widget,Dart,Graphical User Interface Design +flutter_auto_gui,Dart,Graphical User Interface Design +gui_shape,Dart,Graphical User Interface Design +rinf,Dart,Algorithms and Data Structures +collections,Python,Algorithms and Data Structures +heapq,Python,Algorithms and Data Structures +algorithms,Python,Algorithms and Data Structures +sortedcontainers,Python,Algorithms and Data Structures +asyncio,Python,Networking and Messaging +socket,Python,Networking and Messaging +kafka-python,Python,Networking and Messaging +dramatiq,Python,Networking and Messaging +arrow,Python,Scheduling and Concurrency +dateutil,Python,Scheduling and Concurrency +threading-framework,Python,Scheduling and Concurrency +schedule,Python,Scheduling and Concurrency +authlib,Python,Security +pyjwt,Python,Security +django-allauth,Python,Security +cryptography,Python,Security +webassets,Python,Web Development +html2text,Python,Web Development +websockets,Python,Web Development +tornado,Python,Web Development +scipy,Python,Mathematics and Numerics +numpy,Python,Mathematics and Numerics +statsmodel,Python,Mathematics and Numerics +sympy,Python,Mathematics and Numerics +pymysql,Python,Database Management +psycopg,Python,Database Management +pymongo,Python,Database Management +pickledb,Python,Database Management +mimetypes,Python,File Handling +pathlib,Python,File Handling +python-magic,Python,File Handling +wqtchdog,Python,Scheduling and Concurrency +logging,Python,Logging and Monitoring +structlog,Python,Logging and Monitoring +loguru,Python,Logging and Monitoring +psutil,Python,System Administration +flake8,Python,Code Analysis and Linting +pyflakes,Python,Code Analysis and Linting +pycodestyle,Python,Code Analysis and Linting +pylint,Python,Code Analysis and Linting +mamba,Python,Testing +pytest,Python,Testing +unittest,Python,Testing +selenium,Python,Web Development +pandas,Python,Data Analysis +optimus,Python,Data Analysis +schema,Python,Data Analysis +pydantic,Python,Data Serialization +marshmallow,Python,Data Serialization +pysimdjson,Python,Data Serialization +json,Python,Data Serialization +prophy,Python,Data Analysis +ansible,Python,IT Automation +pyinfra,Python,IT Automation +fabric,Python,IT Automation +borg,Python,System Administration +tkinter,Python,Graphical User Interface Design +pyglet,Python,Graphical User Interface Design +pyqt,Python,Graphical User Interface Design +kivy,Python,Graphical User Interface Design +Graph,Perl,Algorithms and Data Structures +MetaMap-DataStructures,Perl,Algorithms and Data Structures +Array-Circular,Perl,Algorithms and Data Structures +Tree-R,Perl,Algorithms and Data Structures +NetAddr-MAC,Perl,Networking and Messaging +Net-OpenSSH,Perl,Networking and Messaging +Parse-IPCommand,Perl,Networking and Messaging +Net-SSH2,Perl,Networking and Messaging +docpad-plugin-scheduling,Perl,Scheduling and Concurrency +Async-Event-Interval,Perl,Scheduling and Concurrency +Schedule-SGELK,Perl,Scheduling and Concurrency +Mojolicious-Plugin-Cron-Scheduler,Perl,Scheduling and Concurrency +DBIx-Class-BcryptColumn,Perl,Security +Crypt-DRBG,Perl,Security +WWW-KeePassRest,Perl,Web Development +Plack-Middleware-SecureHeaders,Perl,Security +Mojolicious,Perl,Web Development +Dancer2,Perl,Web Development +Catalyst,Perl,Web Development +Kossy,Perl,Web Development +SPVM-Math,Perl,Mathematics and Numerics +App-Math-Tutor,Perl,Mathematics and Numerics +Math-RPN-Tiny,Perl,Mathematics and Numerics 
+Math-Sidef,Perl,Mathematics and Numerics +DBD-mysql,Perl,Database Management +Redis,Perl,Database Management +github.com/percona/percona-toolkit,Perl,Database Management +Database-Abstraction,Perl,Database Management +Path-Tiny,Perl,File Handling +File-Util,Perl,File Handling +PDF-API2,Perl,Data Serialization +IO-All,Perl,File Handling +CPAN-Testers-Schema,Perl,Data Analysis +Log-Report,Perl,Logging and Monitoring +Log-Contextual,Perl,Logging and Monitoring +event-tracer,Perl,Logging and Monitoring +Perl-Lint,Perl,Code Analysis and Linting +Perl-Critic,Perl,Code Analysis and Linting +B-Lint,Perl,Code Analysis and Linting +Perl-Analyzer,Perl,Code Analysis and Linting +Test-Strict,Perl,Testing +Math-BigInt,Perl,Mathematics and Numerics +Test-MockModule,Perl,Testing +Test-Without-Module,Perl,Testing +CLIPSeqTools,Perl,Data Analysis +App-RecordStream,Perl,Data Analysis +Data::Table,Perl,Data Analysis +PDL::Dataframe,Perl,Data Analysis +wxPerl,Perl,Graphical User Interface Design +Perl-Tk,Perl,Graphical User Interface Design +Prima,Perl,Graphical User Interface Design +Perl/KDE,Perl,Graphical User Interface Design +AnyData,Perl,Data Serialization +Data-Format-Pretty-YAML,Perl,Data Serialization +TOML-Tiny,Perl,Data Serialization +CatalystX-Controller-ExtJS-REST-SimpleExcel,Perl,Web Development +Rex,Perl,IT Automation +com.viliussutkus89:SampleLibraryForSonatypePromotionTesting,Perl,IT Automation +Jenkins::API,Perl,IT Automation +Minilla,Perl,IT Automation +@discordjs/collection,TypeScript,Algorithms and Data Structures +js-sdsl,TypeScript,Algorithms and Data Structures +typescript-collections,TypeScript,Algorithms and Data Structures +fast-array-diff,TypeScript,Algorithms and Data Structures +libp2p,TypeScript,Networking and Messaging +@multiformats/multiaddr,TypeScript,Networking and Messaging +@ethersproject/networks,TypeScript,Networking and Messaging +nats,TypeScript,Networking and Messaging +@types/node-schedule,TypeScript,Scheduling and Concurrency +agenda,TypeScript,Scheduling and Concurrency +@nestjs/schedule,TypeScript,Scheduling and Concurrency +@solid-primitives/scheduled,TypeScript,Scheduling and Concurrency +helmet,TypeScript,Security +snyk,TypeScript,Security +express-rate-limit,TypeScript,Web Development +jssha,TypeScript,Security +vite,TypeScript,Web Development +vue-template-compiler,TypeScript,Web Development +@testing-library/user-event,TypeScript,Testing +antd,TypeScript,Graphical User Interface Design +random-js,TypeScript,Mathematics and Numerics +math-expression-evaluator,TypeScript,Mathematics and Numerics +normal-distribution,TypeScript,Mathematics and Numerics +@mathigon/fermat,TypeScript,Mathematics and Numerics +mongodb,TypeScript,Database Management +sequelize,TypeScript,Database Management +firebase,TypeScript,Database Management +typeorm,TypeScript,Database Management +rollup-plugin-dts,TypeScript,Code Analysis and Linting +tsx,TypeScript,Code Analysis and Linting +ts-node-dev,TypeScript,Code Analysis and Linting +serve,TypeScript,Web Development +@oclif/errors,TypeScript,Error Handling +@storybook/addon-console,TypeScript,Graphical User Interface Design +conventional-changelog-writer,TypeScript,IT Automation +git-raw-commits,TypeScript,IT Automation +@codemirror/lint,TypeScript,Code Analysis and Linting +@start/plugin-lib-eslint,TypeScript,Code Analysis and Linting +remark-lint-fenced-code-flag-case,TypeScript,Code Analysis and Linting +tslint-rxjs-subject-restrictions-rule,TypeScript,Code Analysis and Linting +jest,TypeScript,Testing +ts-jest,TypeScript,Testing 
+babel-jest,TypeScript,Testing +vitest,TypeScript,Testing +data-forge,TypeScript,Data Analysis +vue-component-meta,TypeScript,Graphical User Interface Design +@opticss/element-analysis,TypeScript,Graphical User Interface Design +@antv/l7-scene,TypeScript,Graphical User Interface Design +table,TypeScript,Data Analysis +form-data-encoder,TypeScript,File Handling +ion-js,TypeScript,Web Development +@nsis/language-data,TypeScript,Data Analysis +docker-compose,TypeScript,IT Automation +commitlint-azure-pipelines-cli,TypeScript,IT Automation +azure-devops-node-api,TypeScript,IT Automation +@karmaniverous/get-dotenv,TypeScript,File Handling +happy-dom,TypeScript,Graphical User Interface Design +react-png-tooltip,TypeScript,Graphical User Interface Design +infamous,TypeScript,Graphical User Interface Design +lume,TypeScript,Web Development +github.com/davecgh/go-spew,Go,Algorithms and Data Structures +github.com/google/btree,Go,Algorithms and Data Structures +github.com/lann/ps,Go,Algorithms and Data Structures +github.com/cespare/xxhash/v2,Go,Algorithms and Data Structures +golang.org/x/net,Go,Networking and Messaging +github.com/vishvananda/netns,Go,Networking and Messaging +github.com/nats-io/nats,Go,Networking and Messaging +github.com/jackc/pgproto3/v2,Go,Database Management +k8s.io/kubernetes,Go,IT Automation +github.com/go-co-op/gocron,Go,Scheduling and Concurrency +atomicgo.dev/schedule,Go,Scheduling and Concurrency +github.com/jasonlvhit/gocron,Go,Scheduling and Concurrency +github.com/google/uuid,Go,Algorithms and Data Structures +github.com/golang-jwt/jwt/v4,Go,Security +github.com/microcosm-cc/bluemonday,Go,Security +github.com/99designs/keyring,Go,Security +github.com/gin-gonic/gin,Go,Web Development +github.com/go-redis/cache/v8,Go,Database Management +github.com/gorilla/sessions,Go,Web Development +github.com/labstack/echo/v4,Go,Web Development +gopkg.in/inf.v0,Go,Algorithms and Data Structures +github.com/go-corelibs/maths,Go,Mathematics and Numerics +github.com/go-inf/inf,Go,Algorithms and Data Structures +github.com/pkg/math,Go,Mathematics and Numerics +github.com/go-sql-driver/mysql,Go,Database Management +github.com/lib/pq,Go,Database Management +go.mongodb.org/mongo-driver,Go,Database Management +go.etcd.io/bbolt,Go,Database Management +github.com/pelletier/go-toml/v2,Go,Data Serialization +github.com/joho/godotenv,Go,File Handling +cloud.google.com/go/storage,Go,Database Management +github.com/minio/minio-go/v7,Go,Database Management +github.com/sirupsen/logrus,Go,Logging and Monitoring +go.uber.org/zap,Go,Logging and Monitoring +github.com/go-logr/logr,Go,Logging and Monitoring +go.opentelemetry.io/otel,Go,Logging and Monitoring +golang.org/x/lint,Go,Code Analysis and Linting +github.com/golangci/lint-1,Go,Code Analysis and Linting +github.com/mvdan/lint,Go,Code Analysis and Linting +github.com/golang/lint,Go,Code Analysis and Linting +github.com/stretchr/testify,Go,Testing +github.com/google/go-cmp,Go,Code Analysis and Linting +gopkg.in/check.v1,Go,Testing +github.com/onsi/ginkgo,Go,Testing +github.com/rocketlaunchr/dataframe-go,Go,Data Analysis +github.com/fjukstad/walrus,Go,Algorithms and Data Structures +github.com/hokiegeek/hgtealib,Go,Algorithms and Data Structures +github.com/forchain/bitcoinbigdata,Go,Data Analysis +github.com/google/orderedcode,Go,Code Analysis and Linting +github.com/ipfs/go-block-format,Go,File Handling +github.com/linkedin/goavro/v2,Go,Data Serialization +github.com/minio/sio,Go,File Handling +github.com/power-devops/perfstat,Go,Logging and 
Monitoring +github.com/gruntwork-io/terratest,Go,Testing +go.mozilla.org/sops/v3,Go,Security +github.com/vladimirvivien/gexe,Go,Algorithms and Data Structures +qtypes,Go,Algorithms and Data Structures +github.com/ctessum/gobra,Go,Algorithms and Data Structures +github.com/yogischogi/ui2go,Go,Graphical User Interface Design +github.com/bhojpur/gui,Go,Graphical User Interface Design diff --git a/transforms/code/semantic_profiler/ray/src/ikb/null_libs.csv b/transforms/code/semantic_profiler/ray/src/ikb/null_libs.csv new file mode 100644 index 000000000..821e3406f --- /dev/null +++ b/transforms/code/semantic_profiler/ray/src/ikb/null_libs.csv @@ -0,0 +1,14 @@ +Library,Language +dynamic_bitset,Cpp +tries,Cpp +algorithm,Cpp +uni-algo,Cpp +boost.asio,Cpp +cpp-netlib,Cpp +zmq,Cpp +azmq,Cpp +thread-pool,Cpp +chrono,Cpp +concurrencpp,Cpp +time,Cpp +libressl,Cpp \ No newline at end of file diff --git a/transforms/code/semantic_profiler/ray/src/offline_path/generate_ikb.py b/transforms/code/semantic_profiler/ray/src/offline_path/generate_ikb.py new file mode 100644 index 000000000..04e5a7c01 --- /dev/null +++ b/transforms/code/semantic_profiler/ray/src/offline_path/generate_ikb.py @@ -0,0 +1,110 @@ +import os +import argparse +import pyarrow as pa +import pyarrow.csv as pv +from io import StringIO,BytesIO +from watsonxai import generateResponseWatsonx + + +def gen_combined_strings(list_str): + combined_strings = [] + combined_string = "\nLibrary,Language,Category\n" + for idx, entry in enumerate(list_str, start=1): + entry_string = ",".join([f"{value}" for key, value in entry.items()]) + combined_string += f"{entry_string}\n" + if idx % 30 == 0 or idx == len(list_str): + combined_strings.append(combined_string) + combined_string = "Library,Language,Category\n" + return combined_strings + + +def sanitize_table(table): + sanitized_columns = [] + for column in table.columns: + sanitized_data = column.to_pylist() + sanitized_data = [str(val).replace('"', '') for val in sanitized_data] + sanitized_column = pa.array(sanitized_data) + sanitized_columns.append(sanitized_column) + sanitized_table = pa.table(sanitized_columns, names=table.column_names) + return sanitized_table + +parser = argparse.ArgumentParser(description='Generate IKB.') +parser.add_argument('--null_libs_file', type=str, help='Path to null libraries file.', default=os.getenv('NULL_LIBS_FILE', '../ikb/null_libs.csv')) +parser.add_argument('--cmap_file', type=str, help='Path to concept map file.', default=os.getenv('CMAP_FILE', '../concept_map/updated_concept_list.csv')) +parser.add_argument('--input_examples_file', type=str, help='Path to input examples file.', default=os.getenv('EXAMPLES_I_FILE', '../examples/examples-i.csv')) +parser.add_argument('--output_examples_file', type=str, help='Path to output examples file.', default=os.getenv('EXAMPLES_O_FILE', '../examples/examples-o.csv')) +parser.add_argument('--extracted_data_file', type=str, help='Path to file in which LLM output will be stored.', default=os.getenv('EXTRACTED_DATA_FILE', '../ikb/extracted_data.csv')) +parser.add_argument('--api_type', type=str, help='API Type', default=os.getenv('API_TYPE', 'WatsonxAI')) +parser.add_argument('--api_key', type=str, help='API key', default=os.getenv('API_KEY', '')) +parser.add_argument('--api_endpoint', type=str, help='API endpoint', default=os.getenv('API_ENDPOINT', 'https://us-south.ml.cloud.ibm.com')) +parser.add_argument('--project_id', type=str, help='Project ID', default=os.getenv('PROJECT_ID', '')) +parser.add_argument('--model_id', 
type=str, help='LLM model ID', default=os.getenv('MODEL_ID', 'meta-llama/llama-3-70b-instruct')) + + + + +args = parser.parse_args() +concepts_list = pv.read_csv(args.cmap_file).column('Category').to_pylist() +concepts = ', '.join(concepts_list) + +csv_buffer_i = BytesIO() +pv.write_csv(pv.read_csv(args.input_examples_file), csv_buffer_i) +input_examples = csv_buffer_i.getvalue() + +csv_buffer_o = BytesIO() +pv.write_csv(pv.read_csv(args.output_examples_file), csv_buffer_o) +output_examples = csv_buffer_o.getvalue() + +cols=['Library', 'Language'] +table = pv.read_csv(args.null_libs_file, read_options=pv.ReadOptions(column_names=cols)) +null_library_names = [{col: table[i][j].as_py() for i, col in enumerate(cols)} for j in range(len(table))] +combined_strings = gen_combined_strings(null_library_names) +endtoken = "" + +prompt_name = "My-prompt" +prompt_template = '''You are responsible for classifying programming language packages based on their functionality into one of the following STRICT categories: + ''' + concepts + ''' + + Instructions: + + 1. Input: A CSV containing two columns: + a. Library – the name of the package + b. Language – the programming language of the package + Your task is to append a third column called Category where you will classify the package's primary function into one of the following categories.\n + + 2. Output: The updated CSV with the new Category column. + + 3. Categorization Guidelines: + a. Classify each package based on its primary functionality. + b. Only use categories from the given list. Do not invent or modify categories. + + 4. Output format: Provide the updated CSV data in the exact format as shown below: + a. Columns: Library, Language, Category + b. End the response with to indicate completion. + + 5. Only use categories from the given list. Do not invent or modify categories. + + 6. Strictly do not provide any explanations or commentary or notes before and/or after the table. 
+ + Examples: + INPUT: + ''' + str(input_examples) + "OUTPUT:\n" + str(output_examples).strip("\n")+"\n" + + +for combined_string in combined_strings: + input_template = prompt_template + f"\n\nINPUT: {combined_string} \nOUTPUT: " + if args.api_type == 'WatsonxAI': + response = generateResponseWatsonx(args.api_key, args.api_endpoint, args.model_id, args.project_id, input_template) + data = response.split(endtoken)[0] + csv_file = BytesIO(data.strip().encode('utf-8')) + table = pv.read_csv(csv_file) + table = sanitize_table(table) + with open(args.extracted_data_file, mode='ab') as f: + pv.write_csv(table, f, write_options=pv.WriteOptions(include_header=False)) + + + + + + + diff --git a/transforms/code/semantic_profiler/ray/src/offline_path/watsonxai.py b/transforms/code/semantic_profiler/ray/src/offline_path/watsonxai.py new file mode 100644 index 000000000..bb27137c2 --- /dev/null +++ b/transforms/code/semantic_profiler/ray/src/offline_path/watsonxai.py @@ -0,0 +1,25 @@ +from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams +from ibm_watsonx_ai.foundation_models import ModelInference +from ibm_watsonx_ai import Credentials + + + + +def generateResponseWatsonx(api_key, api_endpoint, model_id, project_id, input_template): + credentials = Credentials(api_key=api_key, url=api_endpoint) + parameters = { + GenParams.DECODING_METHOD: "greedy", + GenParams.MAX_NEW_TOKENS: 1000, + GenParams.STOP_SEQUENCES: [""] + } + model = ModelInference( + model_id=model_id, + params=parameters, + credentials=credentials, + project_id=project_id) + response = model.generate_text(input_template) + return response + + + + diff --git a/transforms/code/semantic_profiler/ray/src/sp_local_ray.py b/transforms/code/semantic_profiler/ray/src/sp_local_ray.py new file mode 100644 index 000000000..4e9e498b1 --- /dev/null +++ b/transforms/code/semantic_profiler/ray/src/sp_local_ray.py @@ -0,0 +1,52 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import sys + +from data_processing.utils import ParamsUtils +from data_processing_ray.runtime.ray import RayTransformLauncher +from sp_transform_ray import SemanticProfilerRayTransformConfiguration + + +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} +worker_options = {"num_cpus": 0.8} +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # where to run + "run_locally": True, + # Data access. 
diff --git a/transforms/code/semantic_profiler/ray/src/sp_local_ray.py b/transforms/code/semantic_profiler/ray/src/sp_local_ray.py
new file mode 100644
index 000000000..4e9e498b1
--- /dev/null
+++ b/transforms/code/semantic_profiler/ray/src/sp_local_ray.py
@@ -0,0 +1,52 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+import os
+import sys
+
+from data_processing.utils import ParamsUtils
+from data_processing_ray.runtime.ray import RayTransformLauncher
+from sp_transform_ray import SemanticProfilerRayTransformConfiguration
+
+
+# create parameters
+input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input"))
+output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output"))
+local_conf = {
+    "input_folder": input_folder,
+    "output_folder": output_folder,
+}
+worker_options = {"num_cpus": 0.8}
+code_location = {"github": "github", "commit_hash": "12345", "path": "path"}
+params = {
+    # where to run
+    "run_locally": True,
+    # Data access. Only required parameters are specified.
+    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
+    # orchestrator
+    "runtime_worker_options": ParamsUtils.convert_to_ast(worker_options),
+    "runtime_num_workers": 3,
+    "runtime_pipeline_id": "pipeline_id",
+    "runtime_job_id": "job_id",
+    "runtime_creation_delay": 0,
+    "runtime_code_location": ParamsUtils.convert_to_ast(code_location),
+    # semantic profiler params
+    "sp_ikb_file": "src/ikb/ikb_model.csv",
+    "sp_null_libs_file": "src/ikb/null_libs.csv",
+}
+if __name__ == "__main__":
+    # Set the simulated command line args
+    sys.argv = ParamsUtils.dict_to_req(d=params)
+    # create launcher
+    launcher = RayTransformLauncher(SemanticProfilerRayTransformConfiguration())
+    # Launch the ray actor(s) to process the input
+    launcher.launch()
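The __main__ block above feeds params to the launcher as simulated command-line arguments. The exact argv shape is owned by ParamsUtils.dict_to_req, so treat the following only as a conceptual approximation of that conversion:

    # Rough approximation, not the real ParamsUtils.dict_to_req output format.
    params = {"runtime_num_workers": 3, "runtime_pipeline_id": "pipeline_id"}
    argv = ["sp_local_ray.py"] + [f"--{key}={value}" for key, value in params.items()]
    print(argv)  # ['sp_local_ray.py', '--runtime_num_workers=3', '--runtime_pipeline_id=pipeline_id']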
diff --git a/transforms/code/semantic_profiler/ray/src/sp_transform_ray.py b/transforms/code/semantic_profiler/ray/src/sp_transform_ray.py
new file mode 100644
index 000000000..b54cb2536
--- /dev/null
+++ b/transforms/code/semantic_profiler/ray/src/sp_transform_ray.py
@@ -0,0 +1,43 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+from data_processing.utils import get_logger
+from data_processing_ray.runtime.ray import RayTransformLauncher
+from data_processing_ray.runtime.ray.runtime_configuration import (
+    RayTransformRuntimeConfiguration,
+)
+from sp_transform import SemanticProfilerTransformConfiguration
+
+
+logger = get_logger(__name__)
+
+
+class SemanticProfilerRayTransformConfiguration(RayTransformRuntimeConfiguration):
+    """
+    Implements the RayTransformConfiguration for SP as required by the RayTransformLauncher.
+    SP does not use a RayRuntime class, so the superclass only needs the base
+    python-only configuration.
+    """
+
+    def __init__(self):
+        """
+        Initialization
+        """
+        super().__init__(transform_config=SemanticProfilerTransformConfiguration())
+
+
+if __name__ == "__main__":
+    launcher = RayTransformLauncher(SemanticProfilerRayTransformConfiguration())
+    logger.info("Launching sp transform")
+    launcher.launch()
diff --git a/transforms/code/semantic_profiler/ray/test-data/expected/metadata.json b/transforms/code/semantic_profiler/ray/test-data/expected/metadata.json
new file mode 100644
index 000000000..eed590d79
--- /dev/null
+++ b/transforms/code/semantic_profiler/ray/test-data/expected/metadata.json
@@ -0,0 +1,46 @@
+{
+    "pipeline": "pipeline_id",
+    "job details": {
+        "job category": "preprocessing",
+        "job name": "NOOP",
+        "job type": "ray",
+        "job id": "job_id",
+        "start_time": "2024-03-01 15:17:56",
+        "end_time": "2024-03-01 15:17:57",
+        "status": "success"
+    },
+    "code": [null],
+    "job_input_params": {
+        "sleep": 0,
+        "checkpointing": false,
+        "max_files": -1,
+        "number of workers": 1,
+        "worker options": {
+            "num_cpus": 0.8
+        },
+        "actor creation delay": 0
+    },
+    "execution_stats": {
+        "cpus": 10,
+        "gpus": 0,
+        "memory": 14.031964112073183,
+        "object_store": 2.0
+    },
+    "job_output_stats": {
+        "source_files": 1,
+        "source_size": 16534,
+        "result_files": 1,
+        "result_size": 16534,
+        "table_processing": 0.012392997741699219,
+        "nfiles": 1,
+        "nrows": 5
+    },
+    "source": {
+        "name": "test-data/data_processing/ray/noop/input",
+        "type": "path"
+    },
+    "target": {
+        "name": "/tmp/NOOP4o9gv2bq",
+        "type": "path"
+    }
+}
diff --git a/transforms/code/semantic_profiler/ray/test-data/expected/test.parquet b/transforms/code/semantic_profiler/ray/test-data/expected/test.parquet
new file mode 100644
index 000000000..748db85ba
Binary files /dev/null and b/transforms/code/semantic_profiler/ray/test-data/expected/test.parquet differ
diff --git a/transforms/code/semantic_profiler/ray/test-data/input/test.parquet b/transforms/code/semantic_profiler/ray/test-data/input/test.parquet
new file mode 100644
index 000000000..f9ac1f024
Binary files /dev/null and b/transforms/code/semantic_profiler/ray/test-data/input/test.parquet differ
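The two test.parquet files are binary fixtures, so the diff cannot display their contents. A quick way to inspect them locally, assuming pyarrow is installed:

    import pyarrow.parquet as pq

    table = pq.read_table("transforms/code/semantic_profiler/ray/test-data/input/test.parquet")
    print(table.schema)    # column names and types of the input fixture
    print(table.num_rows)  # metadata.json above reports nrows = 5 after processing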
diff --git a/transforms/code/semantic_profiler/ray/test/test_sp_ray.py b/transforms/code/semantic_profiler/ray/test/test_sp_ray.py
new file mode 100644
index 000000000..3b7daa890
--- /dev/null
+++ b/transforms/code/semantic_profiler/ray/test/test_sp_ray.py
@@ -0,0 +1,47 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+import os
+
+from data_processing.test_support.launch.transform_test import (
+    AbstractTransformLauncherTest,
+)
+from data_processing_ray.runtime.ray import RayTransformLauncher
+from sp_transform import ikb_file_cli_param, null_libs_file_cli_param
+from sp_transform_ray import SemanticProfilerRayTransformConfiguration
+
+
+class TestRaySemanticProfilerTransform(AbstractTransformLauncherTest):
+    """
+    Extends the super-class to define the test data for the tests defined there.
+    The name of this class MUST begin with the word Test so that pytest recognizes it as a test class.
+    """
+
+    def get_test_transform_fixtures(self) -> list[tuple]:
+        src_file_dir = os.path.abspath(os.path.dirname(__file__))
+        fixtures = []
+
+        launcher = RayTransformLauncher(SemanticProfilerRayTransformConfiguration())
+        input_dir = os.path.join(src_file_dir, "../test-data/input")
+        expected_dir = os.path.join(src_file_dir, "../test-data/expected")
+        runtime_config = {"run_locally": True}
+        transform_config = {
+            ikb_file_cli_param: os.path.join(src_file_dir, "../src/ikb/ikb_model.csv"),
+            null_libs_file_cli_param: os.path.join(src_file_dir, "../src/ikb/null_libs.csv"),
+        }
+        fixtures.append(
+            (
+                launcher,
+                transform_config | runtime_config,
+                input_dir,
+                expected_dir,
+                [],  # optional list of column names to ignore in comparing test-generated with expected.
+            )
+        )
+        return fixtures
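The test class inherits its actual test methods from AbstractTransformLauncherTest, which runs each fixture through the launcher and compares the produced output against the expected directory. A hypothetical direct invocation, assuming the transform's dependencies are installed and the working directory is transforms/code/semantic_profiler/ray:

    # Programmatic equivalent of `python -m pytest test/test_sp_ray.py -v`.
    import sys
    import pytest

    sys.exit(pytest.main(["-v", "test/test_sp_ray.py"]))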