feat: connector for Neo4j #11526

Open · wants to merge 33 commits into base: master
Commits (33):
4e91997 feat: connector for Neo4j (keith-fullsight, Oct 3, 2024)
d4e3d1a Merge branch 'master' into feat(ingestion/neo4j) (keith-fullsight, Oct 8, 2024)
53c2463 feat: connector for Neo4j (keith-fullsight, Oct 10, 2024)
bbccdca Merge branch 'master' into feat(ingestion/neo4j) (k-bartlett, Oct 10, 2024)
d02b3c1 feat: connector for Neo4j (keith-fullsight, Oct 10, 2024)
cbcfa2f feat: connector for Neo4j (keith-fullsight, Oct 10, 2024)
bc4cfcf feat: connector for Neo4j (keith-fullsight, Oct 11, 2024)
d8ad4bd Merge branch 'master' into feat(ingestion/neo4j) (k-bartlett, Oct 11, 2024)
e95d403 Merge branch 'datahub-project:master' into feat(ingestion/neo4j) (k-bartlett, Oct 11, 2024)
bff830a feat: connector for Neo4j (keith-fullsight, Oct 18, 2024)
94e294e Merge branch 'master' into feat(ingestion/neo4j) (keith-fullsight, Oct 18, 2024)
28614af Merge branch 'master' into feat(ingestion/neo4j) (k-bartlett, Oct 18, 2024)
5ac8fd7 feat: connector for Neo4j (keith-fullsight, Oct 21, 2024)
c31a787 Merge remote-tracking branch 'datahub_fork/feat(ingestion/neo4j)' int… (keith-fullsight, Oct 21, 2024)
9360ce6 feat: connector for Neo4j (keith-fullsight, Oct 21, 2024)
3153e6a feat(ingest/transformer/domain): Add support for on conflict do nothi… (asikowitz, Oct 18, 2024)
20aa223 fix(ingest/looker): Remove bad imports from looker_common (#11663) (feldjay, Oct 18, 2024)
c4a8001 feat(ingest/looker): include project name in model/explore properties… (hsheth2, Oct 18, 2024)
12abda4 feat(ingest/fivetran): protect against high sync volume (#11589) (hsheth2, Oct 18, 2024)
bda79bd feat(sdk):platform-resource - complex queries (#11675) (shirshanka, Oct 19, 2024)
03c9de6 fix(docs): fix businessattributes doc (#11653) (deepgarg-visa, Oct 20, 2024)
9b82a7b feat(ingest/fivetran): add safeguards on table/column lineage (#11674) (hsheth2, Oct 21, 2024)
ed7c368 fix(ui): show DataHub logo for DataHub sources in ingestion souces li… (Masterchen09, Oct 21, 2024)
44fad3a Merge branch 'master' into feat(ingestion/neo4j) (k-bartlett, Oct 21, 2024)
bde184b Merge branch 'master' into feat(ingestion/neo4j) (k-bartlett, Oct 22, 2024)
067ff52 Merge branch 'master' into feat(ingestion/neo4j) (k-bartlett, Oct 22, 2024)
aea2174 Merge branch 'master' into feat(ingestion/neo4j) (k-bartlett, Oct 22, 2024)
89b552d Merge branch 'master' into feat(ingestion/neo4j) (k-bartlett, Oct 23, 2024)
ef7390a Merge remote-tracking branch 'origin/master' (keith-fullsight, Oct 23, 2024)
c0e667d Merge remote-tracking branch 'datahub_fork/feat(ingestion/neo4j)' int… (keith-fullsight, Oct 23, 2024)
b6ed703 Merge branch 'master' into feat(ingestion/neo4j) (k-bartlett, Oct 23, 2024)
888b33c Merge branch 'master' into feat(ingestion/neo4j) (k-bartlett, Oct 30, 2024)
1e274ae Merge branch 'master' into feat(ingestion/neo4j) (k-bartlett, Nov 5, 2024)
Binary file added datahub-web-react/src/images/neo4j.png
155 changes: 155 additions & 0 deletions metadata-ingestion/docs/sources/neo4j/neo4j.md
@@ -0,0 +1,155 @@
# Neo4j

<!-- Set Support Status -->
![Certified](https://img.shields.io/badge/support%20status-certified-brightgreen)
![Incubating](https://img.shields.io/badge/support%20status-incubating-blue)
![Testing](https://img.shields.io/badge/support%20status-testing-lightgrey)

## Integration Details

<!-- Plain-language description of what this integration is meant to do. -->
<!-- Include details about where metadata is extracted from (ie. logs, source API, manifest, etc.) -->
Neo4j metadata is ingested into DataHub by running `CALL apoc.meta.data();` against the database. The returned data is parsed
and displayed as Nodes and Relationships in DataHub. Each object is tagged to describe what kind of DataHub
object it is. The defaults are 'Node' and 'Relationship'; these tag values can be overridden in the recipe.
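For reference, below is a minimal sketch of issuing this metadata query with the official `neo4j` Python driver. It is illustrative only, with placeholder connection details, and is not the connector's implementation:

```python
from neo4j import GraphDatabase

# Placeholder connection details; substitute your own instance.
URI = "neo4j+ssc://host:7687"
AUTH = ("neo4j", "password")

with GraphDatabase.driver(URI, auth=AUTH) as driver:
    with driver.session() as session:
        # apoc.meta.data() yields one row per (label, property) pair,
        # with a "type" column distinguishing nodes from relationships.
        for record in session.run("CALL apoc.meta.data();"):
            row = record.data()
            print(row.get("label"), row.get("property"), row.get("type"))
```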



## Metadata Ingestion Quickstart

### Prerequisites

To ingest metadata from Neo4j, you will need:

* A Neo4j instance with the APOC plugin installed
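You can verify that APOC is available by running `RETURN apoc.version();` against your instance (for example, in Neo4j Browser or `cypher-shell`).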


### Install the Plugin(s)

Run the following commands to install the relevant plugin(s):

`pip install 'acryl-datahub[neo4j]'`
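After installation, running `datahub check plugins` should list `neo4j` among the available sources.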


### Configure the Ingestion Recipe(s)

Use the following recipe(s) to get started with ingestion.

<details>
<summary>View All Recipe Configuration Options</summary>

| Field              | Required | Default         | Description                                                                         |
|--------------------|:--------:|:---------------:|-------------------------------------------------------------------------------------|
| source             |          |                 |                                                                                     |
| `type`             | ✅       | `neo4j`         | Must be `neo4j`                                                                     |
| config             |          |                 |                                                                                     |
| `uri`              | ✅       | None            | The URI for the Neo4j server                                                        |
| `username`         | ✅       | None            | Neo4j username                                                                      |
| `password`         | ✅       | None            | Neo4j password                                                                      |
| `gms_server`       | ✅       | None            | Address of the DataHub GMS server                                                   |
| `node_tag`         | ❌       | `Node`          | The tag applied to show that a Neo4j object is a Node                               |
| `relationship_tag` | ❌       | `Relationship`  | The tag applied to show that a Neo4j object is a Relationship                       |
| `environment`      | ✅       | None            | The environment that assets produced by this connector belong to (e.g. `PROD`)     |
| sink               |          |                 |                                                                                     |
| `type`             | ✅       | None            | The sink type (e.g. `datahub-rest`)                                                 |
| config             |          |                 |                                                                                     |
| `server`           | ✅       | None            | Address of the DataHub GMS server                                                   |

</details>


```yml
source:
  type: 'neo4j'
  config:
    uri: 'neo4j+ssc://host:7687'
    username: 'neo4j'
    password: 'password'
    gms_server: &gms_server 'http://localhost:8080'
    node_tag: 'Node'
    relationship_tag: 'Relationship'
    environment: 'PROD'

sink:
  type: "datahub-rest"
  config:
    server: *gms_server
```
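Note that this recipe uses a YAML anchor (`&gms_server` / `*gms_server`) so the GMS address is defined once and reused by the sink. Once the recipe is saved (for example as `neo4j_recipe.yml`), ingestion can be run with the DataHub CLI: `datahub ingest -c neo4j_recipe.yml`.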



### Sample Data Returned from Neo4j

The following is the raw data that is parsed and used to create Nodes and Relationships.


Example relationship:

```json
{
  "relationship_name": {
    "count": 1,
    "properties": {},
    "type": "relationship"
  }
}
```

Example node:

```json
{
  "key": "Neo4j_Node",
  "value": {
    "count": 10,
    "labels": [],
    "properties": {
      "node_id": {
        "unique": true,
        "indexed": true,
        "type": "STRING",
        "existence": false
      },
      "node_name": {
        "unique": false,
        "indexed": false,
        "type": "STRING",
        "existence": false
      }
    },
    "type": "node",
    "relationships": {
      "RELATIONSHIP_1": {
        "count": 10,
        "direction": "in",
        "labels": ["Node_1", "Node_2", "Node_3"],
        "properties": {
          "relationship_name": {
            "indexed": false,
            "type": "STRING",
            "existence": false,
            "array": false
          },
          "relationship_id": {
            "indexed": false,
            "type": "INTEGER",
            "existence": false,
            "array": false
          }
        }
      },
      "RELATIONSHIP_2": {
        "count": 10,
        "direction": "out",
        "labels": ["Node_4"],
        "properties": {
          "relationship_name": {
            "indexed": false,
            "type": "STRING",
            "existence": false,
            "array": false
          },
          "relationship_id": {
            "indexed": false,
            "type": "INTEGER",
            "existence": false,
            "array": false
          }
        }
      }
    }
  }
}
```
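As a rough illustration of how this output maps to the tagging behavior described above, the hypothetical helper below (a sketch assuming input in the key/value shape of the node example, not the connector's actual code) assigns the configured tag based on each entry's `type` field:

```python
from typing import Any, Dict, List


def classify_entries(
    rows: List[Dict[str, Any]],
    node_tag: str = "Node",
    relationship_tag: str = "Relationship",
) -> List[Dict[str, str]]:
    """Tag each metadata entry as a Node or a Relationship.

    Assumes each row has the {"key": ..., "value": {...}} shape
    shown in the node example above.
    """
    tagged = []
    for row in rows:
        obj_type = row["value"].get("type")
        tag = node_tag if obj_type == "node" else relationship_tag
        tagged.append({"name": row["key"], "tag": tag})
    return tagged
```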
15 changes: 15 additions & 0 deletions metadata-ingestion/docs/sources/neo4j/neo4j_recipe.yml
@@ -0,0 +1,15 @@
source:
  type: 'neo4j'
  config:
    uri: 'neo4j+ssc://host:7687'
    username: 'neo4j'
    password: 'password'
    gms_server: 'http://localhost:8080'
    node_tag: 'Node'
    relationship_tag: 'Relationship'
    environment: 'PROD'

sink:
  type: "datahub-rest"
  config:
    server: 'http://localhost:8080'
1 change: 0 additions & 1 deletion metadata-ingestion/examples/cli_usage/gen_schemas.py
@@ -28,7 +28,6 @@ class CorpGroupFile(BaseModel):


with open("user/user.dhub.yaml_schema.json", "w") as fp:

fp.write(json.dumps(CorpUserFile.schema(), indent=4))

with open("group/group.dhub.yaml_schema.json", "w") as fp:
15 changes: 15 additions & 0 deletions metadata-ingestion/examples/recipes/neo4j_to_datahub.dhub.yaml
@@ -0,0 +1,15 @@
source:
  type: 'neo4j'
  config:
    uri: 'neo4j+ssc://host:7687'
    username: 'neo4j'
    password: 'password'
    gms_server: 'http://localhost:8080'
    node_tag: 'Node'
    relationship_tag: 'Relationship'
    environment: 'PROD'

sink:
  type: "datahub-rest"
  config:
    server: 'http://localhost:8080'
5 changes: 5 additions & 0 deletions metadata-ingestion/setup.py
@@ -321,6 +321,8 @@
"Authlib",
}

neo4j = {"neo4j", "pandas"}

# Note: for all of these, framework_common will be added.
plugins: Dict[str, Set[str]] = {
# Sink plugins.
@@ -488,6 +490,7 @@
"qlik-sense": sqlglot_lib | {"requests", "websocket-client"},
"sigma": sqlglot_lib | {"requests"},
"sac": sac,
"neo4j": neo4j
}

# This is mainly used to exclude plugins from the Docker image.
@@ -630,6 +633,7 @@
"qlik-sense",
"sigma",
"sac",
"neo4j"
]
if plugin
for dependency in plugins[plugin]
@@ -747,6 +751,7 @@
"qlik-sense = datahub.ingestion.source.qlik_sense.qlik_sense:QlikSenseSource",
"sigma = datahub.ingestion.source.sigma.sigma:SigmaSource",
"sac = datahub.ingestion.source.sac.sac:SACSource",
"neo4j = datahub.ingestion.source.neo4j.neo4j_source:Neo4jSource",
],
"datahub.ingestion.transformer.plugins": [
"pattern_cleanup_ownership = datahub.ingestion.transformer.pattern_cleanup_ownership:PatternCleanUpOwnership",
@@ -70,7 +70,6 @@ def to_resource_info(self) -> models.PlatformResourceInfoClass:


class OpenAPIGraphClient:

ENTITY_KEY_ASPECT_MAP = {
aspect_type.ASPECT_INFO.get("keyForEntity"): name
for name, aspect_type in models.ASPECT_NAME_MAP.items()
@@ -150,7 +150,6 @@ def create(file: str, graph: Optional[DataHubGraph] = None) -> None:

@classmethod
def from_datahub(cls, graph: DataHubGraph, urn: str) -> "StructuredProperties":

structured_property: Optional[
StructuredPropertyDefinitionClass
] = graph.get_aspect(urn, StructuredPropertyDefinitionClass)
1 change: 0 additions & 1 deletion metadata-ingestion/src/datahub/ingestion/fs/s3_fs.py
@@ -32,7 +32,6 @@ def __str__(self):


class S3ListIterator(Iterator):

MAX_KEYS = 1000

def __init__(
@@ -33,7 +33,6 @@

@dataclass
class ClassificationReportMixin:

num_tables_fetch_sample_values_failed: int = 0

num_tables_classification_attempted: int = 0
@@ -112,7 +111,6 @@ def classify_schema_fields(
schema_metadata: SchemaMetadata,
sample_data: Union[Dict[str, list], Callable[[], Dict[str, list]]],
) -> None:

if not isinstance(sample_data, Dict):
try:
# TODO: In future, sample_data fetcher can be lazily called if classification
@@ -339,7 +339,6 @@ class BigQueryV2Config(
StatefulProfilingConfigMixin,
ClassificationSourceConfigMixin,
):

include_schema_metadata: bool = Field(
default=True,
description="Whether to ingest the BigQuery schema, i.e. projects, schemas, tables, and views.",
@@ -304,7 +304,6 @@ def _process_project(
project_id
)
except Exception as e:

if self.config.project_ids and "not enabled BigQuery." in str(e):
action_mesage = (
"The project has not enabled BigQuery API. "
@@ -365,7 +364,6 @@ def _process_project_datasets(
bigquery_project: BigqueryProject,
db_tables: Dict[str, List[BigqueryTable]],
) -> Iterable[MetadataWorkUnit]:

db_views: Dict[str, List[BigqueryView]] = {}
db_snapshots: Dict[str, List[BigqueryTableSnapshot]] = {}
project_id = bigquery_project.id
@@ -1004,7 +1002,6 @@ def get_tables_for_dataset(
) -> Iterable[BigqueryTable]:
# In bigquery there is no way to query all tables in a Project id
with PerfTimer() as timer:

# PARTITIONS INFORMATION_SCHEMA view is not available for BigLake tables
# based on Amazon S3 and Blob Storage data.
# https://cloud.google.com/bigquery/docs/omni-introduction#limitations
@@ -290,7 +290,6 @@ def get_workunits_internal(
def deduplicate_queries(
self, queries: FileBackedList[ObservedQuery]
) -> FileBackedDict[Dict[int, ObservedQuery]]:

# This fingerprint based deduplication is done here to reduce performance hit due to
# repetitive sql parsing while adding observed query to aggregator that would otherwise
# parse same query multiple times. In future, aggregator may absorb this deduplication.
@@ -328,7 +327,6 @@ def deduplicate_queries(
return queries_deduped

def fetch_query_log(self, project: BigqueryProject) -> Iterable[ObservedQuery]:

# Multi-regions from https://cloud.google.com/bigquery/docs/locations#supported_locations
regions = self.config.region_qualifiers

@@ -341,7 +339,6 @@ def fetch_query_log(self, project: BigqueryProject) -> Iterable[ObservedQuery]:
def fetch_region_query_log(
self, project: BigqueryProject, region: str
) -> Iterable[ObservedQuery]:

# Each region needs to be a different query
query_log_query = _build_enriched_query_log_query(
project_id=project.id,
@@ -435,7 +432,6 @@ def _build_enriched_query_log_query(
start_time: datetime,
end_time: datetime,
) -> str:

audit_start_time = start_time.strftime(BQ_DATETIME_FORMAT)
audit_end_time = end_time.strftime(BQ_DATETIME_FORMAT)

@@ -371,7 +371,6 @@ def _get_schema_fields(
def _get_schema_metadata(
self, topic: str, platform_urn: str, is_subject: bool
) -> Optional[SchemaMetadata]:

# Process the value schema
schema, fields = self._get_schema_and_fields(
topic=topic,
@@ -7,7 +7,6 @@


class PathSpecsConfigMixin(ConfigModel):

path_specs: List[PathSpec] = Field(
description="List of PathSpec. See [below](#path-spec) the details about PathSpec"
)
@@ -96,7 +96,6 @@ def _get_database_workunits(
)
mcps = reader.get_aspects(from_createdon, self.report.stop_time)
for i, (mcp, createdon) in enumerate(mcps):

if not self.urn_pattern.allowed(str(mcp.entityUrn)):
continue

@@ -235,7 +235,6 @@ def _process_table(
table_name: str,
dataset_name: str,
) -> Iterable[MetadataWorkUnit]:

logger.debug(f"Processing table: {dataset_name}")
table_info = dynamodb_client.describe_table(TableName=table_name)["Table"]
account_id = table_info["TableArn"].split(":")[4]
@@ -307,7 +307,6 @@ def view_fields_from_dict(
type_cls: ViewFieldType,
populate_sql_logic_in_descriptions: bool,
) -> "ViewField":

is_primary_key = field_dict.get("primary_key", "no") == "yes"

name = field_dict["name"]
@@ -988,13 +987,11 @@ def from_api(  # noqa: C901
field_name_vs_raw_explore_field: Dict = {}

if explore.fields is not None:

if explore.fields.dimensions is not None:
for dim_field in explore.fields.dimensions:
if dim_field.name is None:
continue
else:

field_name_vs_raw_explore_field[dim_field.name] = dim_field

view_fields.append(
@@ -1035,7 +1032,6 @@
if measure_field.name is None:
continue
else:

field_name_vs_raw_explore_field[
measure_field.name
] = measure_field