Skip to content

Commit

Permalink
fix(ingest): added new transformer to cleanup suffix/prefix in owner …
Browse files Browse the repository at this point in the history
  • Loading branch information
dushayntAW authored Mar 22, 2024
1 parent 2248737 commit dd502ae
Show file tree
Hide file tree
Showing 4 changed files with 456 additions and 1 deletion.
20 changes: 19 additions & 1 deletion metadata-ingestion/docs/transformer/dataset_transformer.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ The below table shows transformer which can transform aspects of entity [Dataset
| Dataset Aspect | Transformer |
|---------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `status` | - [Mark Dataset status](#mark-dataset-status) |
| `ownership` | - [Simple Add Dataset ownership](#simple-add-dataset-ownership)<br/> - [Pattern Add Dataset ownership](#pattern-add-dataset-ownership)<br/> - [Simple Remove Dataset Ownership](#simple-remove-dataset-ownership)<br/> - [Extract Ownership from Tags](#extract-ownership-from-tags) |
| `ownership` | - [Simple Add Dataset ownership](#simple-add-dataset-ownership)<br/> - [Pattern Add Dataset ownership](#pattern-add-dataset-ownership)<br/> - [Simple Remove Dataset Ownership](#simple-remove-dataset-ownership)<br/> - [Extract Ownership from Tags](#extract-ownership-from-tags)<br/> - [Clean suffix prefix from Ownership](#clean-suffix-prefix-from-ownership) |
| `globalTags` | - [Simple Add Dataset globalTags ](#simple-add-dataset-globaltags)<br/> - [Pattern Add Dataset globalTags](#pattern-add-dataset-globaltags)<br/> - [Add Dataset globalTags](#add-dataset-globaltags) |
| `browsePaths` | - [Set Dataset browsePath](#set-dataset-browsepath) |
| `glossaryTerms` | - [Simple Add Dataset glossaryTerms ](#simple-add-dataset-glossaryterms)<br/> - [Pattern Add Dataset glossaryTerms](#pattern-add-dataset-glossaryterms) |
Expand Down Expand Up @@ -38,6 +38,24 @@ transformers:
email_domain: "coolcompany.com"
```
## Clean suffix prefix from Ownership
### Config Details
| Field | Required | Type | Default | Description |
|-----------------------------|----------|---------|---------------|---------------------------------------------|
| `pattern_for_cleanup` | ✅ | list[string] | | List of suffix/prefix to remove from the Owner URN(s) |


Matches each pattern against an Owner URN and removes the matching part from the Owner URN.

```yaml
transformers:
- type: "pattern_cleanup_ownership"
config:
pattern_for_cleanup:
- "ABCDEF"
- (?<=_)(\w+)
```

## Mark Dataset Status
### Config Details
| Field | Required | Type | Default | Description |
Expand Down
1 change: 1 addition & 0 deletions metadata-ingestion/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -649,6 +649,7 @@
"qlik-sense = datahub.ingestion.source.qlik_sense.qlik_sense:QlikSenseSource",
],
"datahub.ingestion.transformer.plugins": [
"pattern_cleanup_ownership = datahub.ingestion.transformer.pattern_cleanup_ownership:PatternCleanUpOwnership",
"simple_remove_dataset_ownership = datahub.ingestion.transformer.remove_dataset_ownership:SimpleRemoveDatasetOwnership",
"mark_dataset_status = datahub.ingestion.transformer.mark_dataset_status:MarkDatasetStatus",
"set_dataset_browse_path = datahub.ingestion.transformer.add_dataset_browse_path:AddDatasetBrowsePathTransformer",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import re
from typing import List, Optional, Set, cast

import datahub.emitter.mce_builder as builder
from datahub.configuration.common import ConfigModel
from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.transformer.dataset_transformer import (
DatasetOwnershipTransformer,
)
from datahub.metadata.schema_classes import (
OwnerClass,
OwnershipClass,
OwnershipTypeClass,
)

# Prefix of corp-user owner URNs; the user id is the text that follows it.
_USER_URN_PREFIX: str = "urn:li:corpuser:"


class PatternCleanUpOwnershipConfig(ConfigModel):
    """Configuration for the ``PatternCleanUpOwnership`` transformer."""

    # Each entry is used as a regular expression (passed to ``re.sub``);
    # every match is removed from the owner's user id.
    pattern_for_cleanup: List[str]


class PatternCleanUpOwnership(DatasetOwnershipTransformer):
    """Transformer that cleans the owner URNs of a dataset.

    For every owner currently attached to the entity in the graph, each
    regular expression in ``config.pattern_for_cleanup`` is applied in order
    to the user id part of the URN and every match is removed.
    """

    ctx: PipelineContext
    config: PatternCleanUpOwnershipConfig

    def __init__(self, config: PatternCleanUpOwnershipConfig, ctx: PipelineContext):
        super().__init__()
        self.ctx = ctx
        self.config = config

    @classmethod
    def create(
        cls, config_dict: dict, ctx: PipelineContext
    ) -> "PatternCleanUpOwnership":
        """Build the transformer from a raw configuration dictionary."""
        config = PatternCleanUpOwnershipConfig.parse_obj(config_dict)
        return cls(config, ctx)

    def _get_current_owner_urns(self, entity_urn: str) -> Set[str]:
        """Return the owner URNs the graph currently holds for ``entity_urn``.

        An empty set is returned when no graph client is configured or when
        the graph has no ownership for the entity.
        """
        if self.ctx.graph is None:
            return set()

        current_ownership = self.ctx.graph.get_ownership(entity_urn=entity_urn)
        if current_ownership is None:
            return set()

        return {owner.owner for owner in current_ownership.owners}

    def transform_aspect(
        self, entity_urn: str, aspect_name: str, aspect: Optional[builder.Aspect]
    ) -> Optional[builder.Aspect]:
        """Produce an ownership aspect whose owner URNs have been cleaned.

        The owners are read from the graph (see ``_get_current_owner_urns``);
        ``aspect_name`` and the incoming ``aspect`` are not consulted.

        Returns:
            An ``OwnershipClass`` with one ``DATAOWNER`` entry per owner URN.
        """
        # get current owner URNs from the graph
        current_owner_urns = self._get_current_owner_urns(entity_urn)

        # clean all the owners based on the parameters received from config
        cleaned_owner_urns: List[str] = []
        for owner_urn in current_owner_urns:
            if not owner_urn.startswith(_USER_URN_PREFIX):
                # Not a corp-user URN (e.g. a group owner): there is no user
                # id to clean, so keep the owner untouched instead of
                # failing with an IndexError on the prefix split.
                cleaned_owner_urns.append(owner_urn)
                continue

            # Slice rather than split so that an id which happens to contain
            # the prefix text again is not truncated.
            user_id: str = owner_urn[len(_USER_URN_PREFIX):]
            for pattern in self.config.pattern_for_cleanup:
                user_id = re.sub(pattern, "", user_id)

            cleaned_owner_urns.append(_USER_URN_PREFIX + user_id)

        ownership_type, ownership_type_urn = builder.validate_ownership_type(
            OwnershipTypeClass.DATAOWNER
        )

        # generate the ownership aspect for the cleaned users
        out_ownership_aspect: OwnershipClass = OwnershipClass(
            owners=[
                OwnerClass(
                    owner=owner,
                    type=ownership_type,
                    typeUrn=ownership_type_urn,
                )
                for owner in cleaned_owner_urns
            ],
            lastModified=None,
        )
        return cast(Optional[builder.Aspect], out_ownership_aspect)
Loading

0 comments on commit dd502ae

Please sign in to comment.