# metadata-service/configuration/src/main/resources/application.yaml

# Base URL at which users reach DataHub.
baseUrl: "${DATAHUB_BASE_URL:http://localhost:9002}"

# App Layer
authentication:
  # When true, all requests to the Metadata Service must be authenticated.
  enabled: "${METADATA_SERVICE_AUTH_ENABLED:true}"

  # Configurable chain of Authenticators. Required whenever `enabled` is true!
  authenticators:
    # Authenticates requests carrying DataHub-issued Access Tokens - best not to remove.
    - type: com.datahub.authentication.authenticator.DataHubTokenAuthenticator
      configs:
        # Key used to validate incoming tokens. Should typically equal
        # authentication.tokenService.signingKey (both read the same env var in this file).
        # NOTE(review): the fallback key and salt are committed in this file;
        # override them via the env vars outside of local development.
        signingKey: "${DATAHUB_TOKEN_SERVICE_SIGNING_KEY:WnEdIeTG/VVCLQqGwC/BAkqyY0k+H8NEAtWGejrBI94=}"
        salt: "${DATAHUB_TOKEN_SERVICE_SALT:ohDVbJBvHHVJh9S/UA4BYF9COuNnqqVhr9MLKEGXk1O=}"
    # Needed for the unauthenticated health check endpoints - best not to remove.
    - type: com.datahub.authentication.authenticator.HealthStatusAuthenticator

  # Authenticator failures are normally only warnings; enable this to throw them.
  logAuthenticatorExceptions: "${METADATA_SERVICE_AUTHENTICATOR_EXCEPTIONS_ENABLED:false}"

  # Configured separately from the DataHubTokenAuthenticator because the
  # AuthServiceController uses these to authorize token generation at user login time.
  systemClientId: "${DATAHUB_SYSTEM_CLIENT_ID:__datahub_system}"
  systemClientSecret: "${DATAHUB_SYSTEM_CLIENT_SECRET:JohnSnowKnowsNothing}"

  # DataHub token service settings.
  tokenService:
    # Key used to sign new tokens.
    signingKey: "${DATAHUB_TOKEN_SERVICE_SIGNING_KEY:WnEdIeTG/VVCLQqGwC/BAkqyY0k+H8NEAtWGejrBI94=}"
    salt: "${DATAHUB_TOKEN_SERVICE_SALT:ohDVbJBvHHVJh9S/UA4BYF9COuNnqqVhr9MLKEGXk1O=}"
    issuer: "${DATAHUB_TOKEN_SERVICE_ISSUER:datahub-metadata-service}"
    signingAlgorithm: "${DATAHUB_TOKEN_SERVICE_SIGNING_ALGORITHM:HS256}"

  # Maximum duration of a UI session, in milliseconds. The default is 1 day.
  sessionTokenDurationMs: "${SESSION_TOKEN_DURATION_MS:86400000}"

# Settings related to authorization.
authorization:
  # The default DataHub policies-based authorizer.
  defaultAuthorizer:
    enabled: "${AUTH_POLICIES_ENABLED:true}"
    cacheRefreshIntervalSecs: "${POLICY_CACHE_REFRESH_INTERVAL_SECONDS:120}"
    cachePolicyFetchSize: "${POLICY_CACHE_FETCH_SIZE:1000}"
  # Turns on authorization of reads, writes, and deletes on REST APIs.
  restApiAuthorization: "${REST_API_AUTHORIZATION_ENABLED:true}"
  view:
    enabled: "${VIEW_AUTHORIZATION_ENABLED:false}"
    recommendations:
      # Currently limited to the actor only, see TODO: DataHubAuthorizer
      peerGroupEnabled: "${VIEW_AUTHORIZATION_RECOMMENDATIONS_PEER_GROUP_ENABLED:true}"

ingestion:
  enabled: "${UI_INGESTION_ENABLED:true}"
  # The value of cliMajorVersion is substituted in by the processResources Gradle task.
  defaultCliVersion: "${UI_INGESTION_DEFAULT_CLI_VERSION:@cliMajorVersion@}"
  # Maximum JSON String length Jackson will handle; this bounds the maximum
  # size of ingested aspects.
  maxSerializedStringLength: "${INGESTION_MAX_SERIALIZED_STRING_LENGTH:16000000}"

# Telemetry toggles, each resolved from its own environment variable.
telemetry:
  enabledCli: "${CLI_TELEMETRY_ENABLED:true}"
  enabledIngestion: "${INGESTION_REPORTING_ENABLED:false}"
  enableThirdPartyLogging: "${ENABLE_THIRD_PARTY_LOGGING:false}"
  enabledServer: "${DATAHUB_TELEMETRY_ENABLED:true}"

secretService:
  # Expression form (#{...}) rather than a ${...} placeholder: reads
  # SECRET_SERVICE_ENCRYPTION_KEY from the system environment and falls back to
  # the literal 'ENCRYPTION_KEY' when it is not set.
  # NOTE(review): the fallback is a fixed, committed value - set the env var
  # outside of local development.
  encryptionKey: "#{systemEnvironment['SECRET_SERVICE_ENCRYPTION_KEY'] ?: 'ENCRYPTION_KEY'}"

datahub:
  serverType: "${DATAHUB_SERVER_TYPE:prod}"
  gms:
    host: "${DATAHUB_GMS_HOST:localhost}"
    port: "${DATAHUB_GMS_PORT:8080}"
    # Reads DATAHUB_GMS_USE_SSL, then GMS_USE_SSL, then defaults to false.
    useSSL: "${DATAHUB_GMS_USE_SSL:${GMS_USE_SSL:false}}"
    async:
      request-timeout-ms: "${DATAHUB_GMS_ASYNC_REQUEST_TIMEOUT_MS:55000}"

    # A URI may be given instead of the host/port/useSSL settings above.
    # When set, the URI takes priority over those separate parameters.
    uri: "${DATAHUB_GMS_URI:#{null}}"

    sslContext:
      # Reads DATAHUB_GMS_SSL_PROTOCOL, then GMS_SSL_PROTOCOL, then null.
      protocol: "${DATAHUB_GMS_SSL_PROTOCOL:${GMS_SSL_PROTOCOL:#{null}}}"

  plugin:
    # Possible values are RESTRICTED or LENIENT; defaults to RESTRICTED.
    pluginSecurityMode: "${PLUGIN_SECURITY_MODE:RESTRICTED}"
    entityRegistry:
      path: "${ENTITY_REGISTRY_PLUGIN_PATH:/etc/datahub/plugins/models}"
      # Rate at which the plugin runnable executes: it runs every N seconds.
      loadDelaySeconds: "${ENTITY_REGISTRY_PLUGIN_LOAD_DELAY_SECONDS:60}"
    retention:
      path: "${RETENTION_PLUGIN_PATH:/etc/datahub/plugins/retention}"
    auth:
      path: "${AUTH_PLUGIN_PATH:/etc/datahub/plugins/auth}"

entityService:
  # The storage sections of this file refer to the values ebean and cassandra.
  impl: "${ENTITY_SERVICE_IMPL:ebean}"
  retention:
    enabled: "${ENTITY_SERVICE_ENABLE_RETENTION:true}"
    applyOnBootstrap: "${ENTITY_SERVICE_APPLY_RETENTION_BOOTSTRAP:false}"

graphService:
  # Defaults to elasticsearch; the neo4j section is only needed when this is neo4j.
  type: "${GRAPH_SERVICE_IMPL:elasticsearch}"

searchService:
  resultBatchSize: "${SEARCH_SERVICE_BATCH_SIZE:100}"
  enableCache: "${SEARCH_SERVICE_ENABLE_CACHE:false}"
  cacheImplementation: "${SEARCH_SERVICE_CACHE_IMPLEMENTATION:caffeine}"
  cache:
    hazelcast:
      serviceName: "${SEARCH_SERVICE_HAZELCAST_SERVICE_NAME:hazelcast-service}"
  # Filter rewriters; container and domain expansion share the same three knobs.
  queryFilterRewriter:
    containerExpansion:
      enabled: "${SEARCH_SERVICE_FILTER_CONTAINER_EXPANSION_ENABLED:true}"
      pageSize: "${SEARCH_SERVICE_FILTER_CONTAINER_EXPANSION_PAGE_SIZE:100}"
      limit: "${SEARCH_SERVICE_FILTER_CONTAINER_EXPANSION_LIMIT:100}"
    domainExpansion:
      enabled: "${SEARCH_SERVICE_FILTER_DOMAIN_EXPANSION_ENABLED:true}"
      pageSize: "${SEARCH_SERVICE_FILTER_DOMAIN_EXPANSION_PAGE_SIZE:100}"
      limit: "${SEARCH_SERVICE_FILTER_DOMAIN_EXPANSION_LIMIT:100}"

configEntityRegistry:
  # Location outside the jar; takes priority over `resource` below.
  path: "${ENTITY_REGISTRY_CONFIG_PATH:../../metadata-models/src/main/resources/entity-registry.yml}"
  # Classpath location of the entity registry.
  resource: "${ENTITY_REGISTRY_CONFIG_CLASSPATH:classpath:/entity-registry.yml}"

platformAnalytics:
  # Resolved from DATAHUB_ANALYTICS_ENABLED; on by default.
  enabled: "${DATAHUB_ANALYTICS_ENABLED:true}"

visualConfig:
  queriesTab:
    # Experimental! This env var is subject to change and may be deprecated in
    # the future. The Queries tab has a larger overhaul coming.
    queriesTabResultSize: "${REACT_APP_QUERIES_TAB_RESULT_SIZE:5}"
  assets:
    logoUrl: "${REACT_APP_LOGO_URL:/assets/platforms/datahublogo.png}"
    faviconUrl: "${REACT_APP_FAVICON_URL:/assets/icons/favicon.ico}"
  appTitle: "${REACT_APP_TITLE:}"
  hideGlossary: "${REACT_APP_HIDE_GLOSSARY:false}"
  entityProfile:
    # Only domains support a default tab right now. To implement it for other
    # entities, update the React code.
    # Set to DOCUMENTATION_TAB to show the documentation tab first.
    domainDefaultTab: "${DOMAIN_DEFAULT_TAB:}"
  searchResult:
    # Enables visual highlighting on search result names/descriptions.
    enableNameHighlight: "${SEARCH_RESULT_NAME_HIGHLIGHT_ENABLED:true}"

# Storage Layer

# Needed only when entityService.impl is ebean
ebean:
  username: "${EBEAN_DATASOURCE_USERNAME:datahub}"
  password: "${EBEAN_DATASOURCE_PASSWORD:datahub}"
  # JDBC URL
  url: "${EBEAN_DATASOURCE_URL:jdbc:mysql://localhost:3306/datahub}"
  # JDBC Driver
  driver: "${EBEAN_DATASOURCE_DRIVER:com.mysql.jdbc.Driver}"
  minConnections: "${EBEAN_MIN_CONNECTIONS:2}"
  maxConnections: "${EBEAN_MAX_CONNECTIONS:50}"
  maxInactiveTimeSeconds: "${EBEAN_MAX_INACTIVE_TIME_IN_SECS:120}"
  maxAgeMinutes: "${EBEAN_MAX_AGE_MINUTES:120}"
  leakTimeMinutes: "${EBEAN_LEAK_TIME_MINUTES:15}"
  waitTimeoutMillis: "${EBEAN_WAIT_TIMEOUT_MILLIS:1000}"
  autoCreateDdl: "${EBEAN_AUTOCREATE:false}"
  postgresUseIamAuth: "${EBEAN_POSTGRES_USE_AWS_IAM_AUTH:false}"
  locking:
    enabled: "${EBEAN_LOCKING_ENABLED:true}"
    durationSeconds: "${EBEAN_LOCKING_DURATION_SECONDS:60}"
    maximumLocks: "${EBEAN_LOCKING_MAXIMUM_LOCKS:20000}"

# Needed only when entityService.impl is cassandra
cassandra:
  datasourceUsername: "${CASSANDRA_DATASOURCE_USERNAME:cassandra}"
  datasourcePassword: "${CASSANDRA_DATASOURCE_PASSWORD:cassandra}"
  hosts: "${CASSANDRA_HOSTS:cassandra}"
  port: "${CASSANDRA_PORT:9042}"
  datacenter: "${CASSANDRA_DATACENTER:datacenter1}"
  keyspace: "${CASSANDRA_KEYSPACE:datahub}"
  useSsl: "${CASSANDRA_USE_SSL:false}"

elasticsearch:
  host: "${ELASTICSEARCH_HOST:localhost}"
  port: "${ELASTICSEARCH_PORT:9200}"
  threadCount: "${ELASTICSEARCH_THREAD_COUNT:2}"
  connectionRequestTimeout: "${ELASTICSEARCH_CONNECTION_REQUEST_TIMEOUT:5000}"
  username: "${ELASTICSEARCH_USERNAME:#{null}}"
  password: "${ELASTICSEARCH_PASSWORD:#{null}}"
  pathPrefix: "${ELASTICSEARCH_PATH_PREFIX:#{null}}"
  useSSL: "${ELASTICSEARCH_USE_SSL:false}"
  opensearchUseAwsIamAuth: "${OPENSEARCH_USE_AWS_IAM_AUTH:false}"
  region: "${AWS_REGION:#{null}}"
  # elasticsearch or opensearch, for handling divergent cases
  implementation: "${ELASTICSEARCH_IMPLEMENTATION:elasticsearch}"
  idHashAlgo: "${ELASTIC_ID_HASH_ALGO:MD5}"
  # Required if useSSL is true
  sslContext:
    protocol: "${ELASTICSEARCH_SSL_PROTOCOL:#{null}}"
    secureRandomImplementation: "${ELASTICSEARCH_SSL_SECURE_RANDOM_IMPL:#{null}}"
    trustStoreFile: "${ELASTICSEARCH_SSL_TRUSTSTORE_FILE:#{null}}"
    trustStoreType: "${ELASTICSEARCH_SSL_TRUSTSTORE_TYPE:#{null}}"
    trustStorePassword: "${ELASTICSEARCH_SSL_TRUSTSTORE_PASSWORD:#{null}}"
    keyStoreFile: "${ELASTICSEARCH_SSL_KEYSTORE_FILE:#{null}}"
    keyStoreType: "${ELASTICSEARCH_SSL_KEYSTORE_TYPE:#{null}}"
    keyStorePassword: "${ELASTICSEARCH_SSL_KEYSTORE_PASSWORD:#{null}}"
    keyPassword: "${ELASTICSEARCH_SSL_KEY_PASSWORD:#{null}}"
  bulkProcessor:
    async: "${ES_BULK_ASYNC:true}"
    requestsLimit: "${ES_BULK_REQUESTS_LIMIT:1000}"
    flushPeriod: "${ES_BULK_FLUSH_PERIOD:1}"
    numRetries: "${ES_BULK_NUM_RETRIES:3}"
    retryInterval: "${ES_BULK_RETRY_INTERVAL:1}"
    refreshPolicy: "${ES_BULK_REFRESH_POLICY:NONE}"
    enableBatchDelete: "${ES_BULK_ENABLE_BATCH_DELETE:false}"
  index:
    prefix: "${INDEX_PREFIX:}"
    numShards: "${ELASTICSEARCH_NUM_SHARDS_PER_INDEX:1}"
    numReplicas: "${ELASTICSEARCH_NUM_REPLICAS_PER_INDEX:1}"
    numRetries: "${ELASTICSEARCH_INDEX_BUILDER_NUM_RETRIES:3}"
    # Increase to 30 if indexing rates are expected to be greater than 100/s.
    refreshIntervalSeconds: "${ELASTICSEARCH_INDEX_BUILDER_REFRESH_INTERVAL_SECONDS:1}"
    maxArrayLength: "${SEARCH_DOCUMENT_MAX_ARRAY_LENGTH:1000}"
    maxObjectKeys: "${SEARCH_DOCUMENT_MAX_OBJECT_KEYS:1000}"
    # i.e. customProperty values
    maxValueLength: "${SEARCH_DOCUMENT_MAX_VALUE_LENGTH:4096}"
    mainTokenizer: "${ELASTICSEARCH_MAIN_TOKENIZER:#{null}}"
    enableMappingsReindex: "${ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX:false}"
    enableSettingsReindex: "${ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX:false}"
    # A value of 0 or less means no timeout.
    maxReindexHours: "${ELASTICSEARCH_INDEX_BUILDER_MAX_REINDEX_HOURS:0}"
    settingsOverrides: "${ELASTICSEARCH_INDEX_BUILDER_SETTINGS_OVERRIDES:#{null}}"
    entitySettingsOverrides: "${ELASTICSEARCH_INDEX_BUILDER_ENTITY_SETTINGS_OVERRIDES:#{null}}"
    docIds:
      schemaField:
        hashIdEnabled: "${ELASTICSEARCH_INDEX_DOC_IDS_SCHEMA_FIELD_HASH_ID_ENABLED:false}"
  buildIndices:
    # Applies when cloneIndices is also enabled.
    allowDocCountMismatch: "${ELASTICSEARCH_BUILD_INDICES_ALLOW_DOC_COUNT_MISMATCH:false}"
    cloneIndices: "${ELASTICSEARCH_BUILD_INDICES_CLONE_INDICES:true}"
    retentionUnit: "${ELASTICSEARCH_BUILD_INDICES_RETENTION_UNIT:DAYS}"
    retentionValue: "${ELASTICSEARCH_BUILD_INDICES_RETENTION_VALUE:60}"
  search:
    maxTermBucketSize: "${ELASTICSEARCH_QUERY_MAX_TERM_BUCKET_SIZE:20}"
    # Behavior of quoted searches: do they apply weights or exclude results.
    exactMatch:
      # If false, only weights are applied; if true, non-exact matches are excluded.
      exclusive: "${ELASTICSEARCH_QUERY_EXACT_MATCH_EXCLUSIVE:false}"
      # Include prefix exact matches.
      withPrefix: "${ELASTICSEARCH_QUERY_EXACT_MATCH_WITH_PREFIX:true}"
      # Boost multiplier when exact with case.
      exactFactor: "${ELASTICSEARCH_QUERY_EXACT_MATCH_FACTOR:16.0}"
      # Boost multiplier when exact prefix.
      prefixFactor: "${ELASTICSEARCH_QUERY_EXACT_MATCH_PREFIX_FACTOR:1.1}"
      # Stacked boost multiplier when case mismatch.
      caseSensitivityFactor: "${ELASTICSEARCH_QUERY_EXACT_MATCH_CASE_FACTOR:0.0}"
      # Enable exact match on structured search.
      enableStructured: "${ELASTICSEARCH_QUERY_EXACT_MATCH_ENABLE_STRUCTURED:true}"
    # Boost multipliers applied when matching on 2-, 3- and 4-gram tokens.
    wordGram:
      twoGramFactor: "${ELASTICSEARCH_QUERY_TWO_GRAM_FACTOR:1.2}"
      threeGramFactor: "${ELASTICSEARCH_QUERY_THREE_GRAM_FACTOR:1.5}"
      fourGramFactor: "${ELASTICSEARCH_QUERY_FOUR_GRAM_FACTOR:1.8}"
    # Field weight annotations are typically calibrated for exact match; if a
    # partial match is possible on the field, use these adjustments.
    partial:
      # Multiplier on Urn token match; a partial match on Urn is assumed to
      # outrank one on non-Urn.
      urnFactor: "${ELASTICSEARCH_QUERY_PARTIAL_URN_FACTOR:0.5}"
      # Multiplier on possible non-Urn token match.
      factor: "${ELASTICSEARCH_QUERY_PARTIAL_FACTOR:0.4}"
    custom:
      enabled: "${ELASTICSEARCH_QUERY_CUSTOM_CONFIG_ENABLED:true}"
      file: "${ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE:search_config.yaml}"
    graph:
      # Graph dao timeout seconds.
      timeoutSeconds: "${ELASTICSEARCH_SEARCH_GRAPH_TIMEOUT_SECONDS:50}"
      # Graph dao batch size.
      batchSize: "${ELASTICSEARCH_SEARCH_GRAPH_BATCH_SIZE:1000}"
      # Graph dao max result size.
      maxResult: "${ELASTICSEARCH_SEARCH_GRAPH_MAX_RESULT:10000}"
      # Allows a path to be retraversed to walk all paths to the node instead of
      # just the shortest; avoids cycles by not rewalking the visited edge.
      enableMultiPathSearch: "${ELASTICSEARCH_SEARCH_GRAPH_MULTI_PATH_SEARCH:false}"
      # Adds a boosting query that ranks graph edges with via nodes higher, used
      # to allow via paths to be prioritized when multi path search is disabled.
      boostViaNodes: "${ELASTICSEARCH_SEARCH_GRAPH_BOOST_VIA_NODES:true}"
      # Enable soft delete tracking of the urns on edges.
      graphStatusEnabled: "${ELASTICSEARCH_SEARCH_GRAPH_STATUS_ENABLED:true}"

# TODO: Kafka topic convention
kafka:
  listener:
    concurrency: "${KAFKA_LISTENER_CONCURRENCY:1}"
  # NOTE(review): the default carries an http:// scheme; confirm this is intended
  # for a bootstrap server address.
  bootstrapServers: "${KAFKA_BOOTSTRAP_SERVER:http://localhost:9092}"
  serde:
    usageEvent:
      key:
        serializer: "${KAFKA_SERDE_USAGE_EVENTS_KEY_SERIALIZER:org.apache.kafka.common.serialization.StringSerializer}"
        deserializer: "${KAFKA_SERDE_USAGE_EVENTS_KEY_DESERIALIZER:org.apache.kafka.common.serialization.StringDeserializer}"
      value:
        serializer: "${KAFKA_SERDE_USAGE_EVENTS_VALUE_SERIALIZER:org.apache.kafka.common.serialization.StringSerializer}"
        deserializer: "${KAFKA_SERDE_USAGE_EVENTS_VALUE_DESERIALIZER:org.apache.kafka.common.serialization.StringDeserializer}"
    event:
      key:
        serializer: "${KAFKA_SERDE_EVENT_KEY_SERIALIZER:org.apache.kafka.common.serialization.StringSerializer}"
        deserializer: "${KAFKA_SERDE_EVENT_KEY_DESERIALIZER:org.springframework.kafka.support.serializer.ErrorHandlingDeserializer}"
        delegateDeserializer: "${KAFKA_SERDE_EVENT_KEY_DELEGATE_DESERIALIZER:org.apache.kafka.common.serialization.StringDeserializer}"
      value:
        serializer: "${KAFKA_SERDE_EVENT_VALUE_SERIALIZER:io.confluent.kafka.serializers.KafkaAvroSerializer}"
        deserializer: "${KAFKA_SERDE_EVENT_VALUE_DESERIALIZER:org.springframework.kafka.support.serializer.ErrorHandlingDeserializer}"
        delegateDeserializer: "${KAFKA_SERDE_EVENT_VALUE_DELEGATE_DESERIALIZER:io.confluent.kafka.serializers.KafkaAvroDeserializer}"
  producer:
    retryCount: "${KAFKA_PRODUCER_RETRY_COUNT:3}"
    deliveryTimeout: "${KAFKA_PRODUCER_DELIVERY_TIMEOUT:30000}"
    requestTimeout: "${KAFKA_PRODUCER_REQUEST_TIMEOUT:3000}"
    backoffTimeout: "${KAFKA_PRODUCER_BACKOFF_TIMEOUT:500}"
    # The producer's compression algorithm.
    compressionType: "${KAFKA_PRODUCER_COMPRESSION_TYPE:snappy}"
    # The max bytes sent by the producer; also see kafka-setup MAX_MESSAGE_BYTES
    # for the matching value.
    maxRequestSize: "${KAFKA_PRODUCER_MAX_REQUEST_SIZE:5242880}"
  consumer:
    # The max bytes consumed per partition.
    maxPartitionFetchBytes: "${KAFKA_CONSUMER_MAX_PARTITION_FETCH_BYTES:5242880}"
    # Stops the kafka listener container on deserialization error, allowing the
    # user to fix problems before moving past the problematic offset. If false,
    # will log and move forward past the offset.
    stopOnDeserializationError: "${KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR:true}"
    # Sets the health indicator to down when a message listener container has
    # stopped due to a deserialization failure; will force consumer apps to
    # restart through k8s and docker-compose health mechanisms.
    healthCheckEnabled: "${KAFKA_CONSUMER_HEALTH_CHECK_ENABLED:true}"
  schemaRegistry:
    # INTERNAL or KAFKA or AWS_GLUE
    type: "${SCHEMA_REGISTRY_TYPE:KAFKA}"
    url: "${KAFKA_SCHEMAREGISTRY_URL:http://localhost:8081}"
    awsGlue:
      region: "${AWS_GLUE_SCHEMA_REGISTRY_REGION:us-east-1}"
      registryName: "${AWS_GLUE_SCHEMA_REGISTRY_NAME:#{null}}"
  schema:
    registry:
      security:
        protocol: "${KAFKA_PROPERTIES_SECURITY_PROTOCOL:PLAINTEXT}"

# Only required if GraphService type is neo4j
neo4j:
  username: ${NEO4J_USERNAME:neo4j}
  password: ${NEO4J_PASSWORD:datahub}
  uri: ${NEO4J_URI:bolt://localhost}
  database: ${NEO4J_DATABASE:graph.db}
  maxConnectionPoolSize: ${NEO4J_MAX_CONNECTION_POOL_SIZE:100}
  maxConnectionAcquisitionTimeout: ${NEO4J_MAX_CONNECTION_ACQUISITION_TIMEOUT_IN_SECONDS:60}
  # Reads the consistently-cased NEO4J_ variable first. The historical
  # misspelling with a lowercase "j" (NEO4j_...) is still honoured as a
  # fallback so existing deployments keep their configured value.
  maxConnectionLifetimeInSeconds: ${NEO4J_MAX_CONNECTION_LIFETIME_IN_SECONDS:${NEO4j_MAX_CONNECTION_LIFETIME_IN_SECONDS:3600}}
  maxTransactionRetryTime: ${NEO4J_MAX_TRANSACTION_RETRY_TIME_IN_SECONDS:30}
  connectionLivenessCheckTimeout: ${NEO4J_CONNECTION_LIVENESS_CHECK_TIMEOUT_IN_SECONDS:-1}

spring:
  mvc:
    servlet:
      path: /openapi
  kafka:
    security:
      # Same env var and default as kafka.schema.registry.security.protocol.
      protocol: "${KAFKA_PROPERTIES_SECURITY_PROTOCOL:PLAINTEXT}"

# springdoc settings; a further springdoc.* property is set with a dotted key
# later in this file.
springdoc:
  cache:
    # Fixed literal value - not overridable through an env var placeholder here.
    disabled: true

metadataTests:
  # Resolved from METADATA_TESTS_ENABLED; off by default.
  enabled: "${METADATA_TESTS_ENABLED:false}"

siblings:
  # Enable to turn on automatic sibling associations for dbt.
  enabled: "${ENABLE_SIBLING_HOOK:true}"
  consumerGroupSuffix: "${SIBLINGS_HOOK_CONSUMER_GROUP_SUFFIX:}"
updateIndices:
  enabled: "${ENABLE_UPDATE_INDICES_HOOK:true}"
  # Empty by default.
  consumerGroupSuffix: "${UPDATE_INDICES_CONSUMER_GROUP_SUFFIX:}"
ingestionScheduler:
  # Enable to execute ingestion scheduling.
  enabled: "${ENABLE_INGESTION_SCHEDULER_HOOK:true}"
  consumerGroupSuffix: "${INGESTION_SCHEDULER_HOOK_CONSUMER_GROUP_SUFFIX:}"
incidents:
  hook:
    enabled: "${ENABLE_INCIDENTS_HOOK:true}"
    maxIncidentHistory: "${MAX_INCIDENT_HISTORY:100}"
    # Empty by default.
    consumerGroupSuffix: "${INCIDENTS_HOOK_CONSUMER_GROUP_SUFFIX:}"

bootstrap:
  policies:
    file: "${BOOTSTRAP_POLICIES_FILE:classpath:boot/policies.json}"
    # Example for a local file:
    # file: "file:///datahub/datahub-gms/resources/custom-policies.json"
  servlets:
    # Total waiting time in seconds for servlets to initialize.
    waitTimeout: "${BOOTSTRAP_SERVLETS_WAITTIMEOUT:60}"

# Settings for the system update job and its individual upgrade steps.
# Most steps expose `enabled` and `batchSize`; some add `delayMs`, `limit`
# or a `reprocess.enabled` toggle.
systemUpdate:
  initialBackOffMs: ${BOOTSTRAP_SYSTEM_UPDATE_INITIAL_BACK_OFF_MILLIS:5000}
  maxBackOffs: ${BOOTSTRAP_SYSTEM_UPDATE_MAX_BACK_OFFS:50}
  backOffFactor: ${BOOTSTRAP_SYSTEM_UPDATE_BACK_OFF_FACTOR:2} # Multiplicative factor for back off, default values will result in waiting 5min 15s
  waitForSystemUpdate: ${BOOTSTRAP_SYSTEM_UPDATE_WAIT_FOR_SYSTEM_UPDATE:true}
  bootstrap:
    mcpConfig: ${SYSTEM_UPDATE_BOOTSTRAP_MCP_CONFIG:bootstrap_mcps.yaml}
  dataJobNodeCLL:
    enabled: ${BOOTSTRAP_SYSTEM_UPDATE_DATA_JOB_NODE_CLL_ENABLED:false}
    batchSize: ${BOOTSTRAP_SYSTEM_UPDATE_DATA_JOB_NODE_CLL_BATCH_SIZE:1000}
    delayMs: ${BOOTSTRAP_SYSTEM_UPDATE_DATA_JOB_NODE_CLL_DELAY_MS:30000}
    limit: ${BOOTSTRAP_SYSTEM_UPDATE_DATA_JOB_NODE_CLL_LIMIT:0}
  domainDescription:
    enabled: ${BOOTSTRAP_SYSTEM_UPDATE_DOMAIN_DESCRIPTION_ENABLED:true}
    batchSize: ${BOOTSTRAP_SYSTEM_UPDATE_DOMAIN_DESCRIPTION_BATCH_SIZE:1000}
    delayMs: ${BOOTSTRAP_SYSTEM_UPDATE_DOMAIN_DESCRIPTION_DELAY_MS:30000}
    # Reads the variable named consistently with the sibling settings first.
    # The older name carrying a stray "_CLL" segment is still honoured as a
    # fallback so existing deployments keep their configured value.
    limit: ${BOOTSTRAP_SYSTEM_UPDATE_DOMAIN_DESCRIPTION_LIMIT:${BOOTSTRAP_SYSTEM_UPDATE_DOMAIN_DESCRIPTION_CLL_LIMIT:0}}
  browsePathsV2:
    enabled: ${BOOTSTRAP_SYSTEM_UPDATE_BROWSE_PATHS_V2_ENABLED:true}
    batchSize: ${BOOTSTRAP_SYSTEM_UPDATE_BROWSE_PATHS_V2_BATCH_SIZE:5000}
    reprocess:
      enabled: ${REPROCESS_DEFAULT_BROWSE_PATHS_V2:false}
  policyFields:
    enabled: ${BOOTSTRAP_SYSTEM_UPDATE_POLICY_FIELDS_ENABLED:true}
    batchSize: ${BOOTSTRAP_SYSTEM_UPDATE_POLICY_FIELDS_BATCH_SIZE:5000}
    reprocess:
      enabled: ${REPROCESS_DEFAULT_POLICY_FIELDS:false}
  ownershipTypes:
    enabled: ${BOOTSTRAP_SYSTEM_UPDATE_OWNERSHIP_TYPES_ENABLED:true}
    batchSize: ${BOOTSTRAP_SYSTEM_UPDATE_OWNERSHIP_TYPES_BATCH_SIZE:1000}
    reprocess:
      enabled: ${BOOTSTRAP_SYSTEM_UPDATE_OWNERSHIP_TYPES_REPROCESS:false}
  schemaFieldsFromSchemaMetadata:
    enabled: ${SYSTEM_UPDATE_SCHEMA_FIELDS_FROM_SCHEMA_METADATA_ENABLED:false}
    batchSize: ${SYSTEM_UPDATE_SCHEMA_FIELDS_FROM_SCHEMA_METADATA_BATCH_SIZE:500}
    delayMs: ${SYSTEM_UPDATE_SCHEMA_FIELDS_FROM_SCHEMA_METADATA_DELAY_MS:1000}
    limit: ${SYSTEM_UPDATE_SCHEMA_FIELDS_FROM_SCHEMA_METADATA_LIMIT:0}
  schemaFieldsDocIds:
    enabled: ${SYSTEM_UPDATE_SCHEMA_FIELDS_DOC_IDS_ENABLED:false}
    batchSize: ${SYSTEM_UPDATE_SCHEMA_FIELDS_DOC_IDS_BATCH_SIZE:500}
    delayMs: ${SYSTEM_UPDATE_SCHEMA_FIELDS_DOC_IDS_DELAY_MS:5000}
    limit: ${SYSTEM_UPDATE_SCHEMA_FIELDS_DOC_IDS_LIMIT:0}
  processInstanceHasRunEvents:
    enabled: ${SYSTEM_UPDATE_PROCESS_INSTANCE_HAS_RUN_EVENTS_ENABLED:true}
    batchSize: ${SYSTEM_UPDATE_PROCESS_INSTANCE_HAS_RUN_EVENTS_BATCH_SIZE:100}
    delayMs: ${SYSTEM_UPDATE_PROCESS_INSTANCE_HAS_RUN_EVENTS_DELAY_MS:1000}
    totalDays: ${SYSTEM_UPDATE_PROCESS_INSTANCE_HAS_RUN_EVENTS_TOTAL_DAYS:90}
    windowDays: ${SYSTEM_UPDATE_PROCESS_INSTANCE_HAS_RUN_EVENTS_WINDOW_DAYS:1}
    reprocess:
      enabled: ${SYSTEM_UPDATE_PROCESS_INSTANCE_HAS_RUN_EVENTS_REPROCESS:false}
  edgeStatus:
    enabled: ${BOOTSTRAP_SYSTEM_UPDATE_EDGE_STATUS_ENABLED:true}
    batchSize: ${BOOTSTRAP_SYSTEM_UPDATE_EDGE_STATUS_BATCH_SIZE:1000}
    delayMs: ${BOOTSTRAP_SYSTEM_UPDATE_EDGE_STATUS_DELAY_MS:5000}
    limit: ${BOOTSTRAP_SYSTEM_UPDATE_EDGE_STATUS_LIMIT:0}
      
structuredProperties:
  # Applies structured properties mappings.
  enabled: "${ENABLE_STRUCTURED_PROPERTIES_HOOK:true}"
  # Write structured property values.
  writeEnabled: "${ENABLE_STRUCTURED_PROPERTIES_WRITE:true}"
  # Applies structured property mappings in the system update job.
  systemUpdateEnabled: "${ENABLE_STRUCTURED_PROPERTIES_SYSTEM_UPDATE:false}"

healthCheck:
  # Cache duration in seconds; defaults to 5.
  cacheDurationSeconds: "${HEALTH_CHECK_CACHE_DURATION_SECONDS:5}"

# Feature toggles. Each flag is resolved from its own environment variable and
# falls back to the default written after the colon.
featureFlags:
  showSimplifiedHomepageByDefault: ${SHOW_SIMPLIFIED_HOMEPAGE_BY_DEFAULT:false} # shows a simplified homepage with just datasets, charts and dashboards by default to users. this can be configured in user settings
  lineageSearchCacheEnabled: ${LINEAGE_SEARCH_CACHE_ENABLED:true} # Enables in-memory cache for searchAcrossLineage query
  graphServiceDiffModeEnabled: ${GRAPH_SERVICE_DIFF_MODE_ENABLED:true} # Enables diff mode for graph writes, uses a different code path that produces a diff from previous to next to write relationships instead of wholesale deleting edges and reading
  pointInTimeCreationEnabled: ${POINT_IN_TIME_CREATION_ENABLED:false} # Enables creation of point in time snapshots for the scroll API, only works with main line ElasticSearch releases after 7.10. OpenSearch is unsupported, plans to eventually target OpenSearch 2.4+ with a divergent client
  alwaysEmitChangeLog: ${ALWAYS_EMIT_CHANGE_LOG:false} # Enables always emitting a MCL even when no changes are detected. Used for Time Based Lineage when no changes occur.
  searchServiceDiffModeEnabled: ${SEARCH_SERVICE_DIFF_MODE_ENABLED:true} # Enables diff mode for search document writes, reduces amount of writes to ElasticSearch documents for no-ops
  readOnlyModeEnabled: ${READ_ONLY_MODE_ENABLED:false} # Enables read only mode for an instance. Right now this only affects ability to edit user profile image URL but can be extended
  showAccessManagement: ${SHOW_ACCESS_MANAGEMENT:false} # Whether we should show AccessManagement tab in the datahub UI.
  showSearchFiltersV2: ${SHOW_SEARCH_FILTERS_V2:true} # Enables showing the search filters V2 experience.
  showBrowseV2: ${SHOW_BROWSE_V2:true} # Enables showing the browse v2 sidebar experience.
  platformBrowseV2: ${PLATFORM_BROWSE_V2:false} # Enables the platform browse experience, instead of the entity-oriented browse default.
  preProcessHooks:
    uiEnabled: ${PRE_PROCESS_HOOKS_UI_ENABLED:true} # Circumvents Kafka for processing index updates for UI changes sourced from GraphQL to avoid processing delays
    # Uses its own variable so reprocessing can be toggled independently of
    # uiEnabled; sharing PRE_PROCESS_HOOKS_UI_ENABLED would switch both at once.
    reprocessEnabled: ${PRE_PROCESS_HOOKS_REPROCESS_ENABLED:false} # If enabled, will reprocess UI sourced events asynchronously when reading from Kafka after pre-processing them synchronously
  showAcrylInfo: ${SHOW_ACRYL_INFO:false} # Show different CTAs within DataHub around moving to DataHub Cloud. Set to true for the demo site.
  erModelRelationshipFeatureEnabled: ${ER_MODEL_RELATIONSHIP_FEATURE_ENABLED:false} # Enable Join Tables Feature and show within Dataset view as Relations
  nestedDomainsEnabled: ${NESTED_DOMAINS_ENABLED:true} # Enables the nested Domains feature that allows users to have sub-Domains. If this is off, Domains appear "flat" again
  schemaFieldEntityFetchEnabled: ${SCHEMA_FIELD_ENTITY_FETCH_ENABLED:true} # Enables fetching for schema field entities from the database when we hydrate them on schema fields
  businessAttributeEntityEnabled: ${BUSINESS_ATTRIBUTE_ENTITY_ENABLED:false} # Enables business attribute entity which can be associated with field of dataset
  dataContractsEnabled: ${DATA_CONTRACTS_ENABLED:true} # Enables the Data Contracts feature (Tab) in the UI
  alternateMCPValidation: ${ALTERNATE_MCP_VALIDATION:false} # Enables alternate MCP validation flow
  showSeparateSiblings: ${SHOW_SEPARATE_SIBLINGS:false} # If turned on, all siblings will be separated with no way to get to a "combined" sibling view
  editableDatasetNameEnabled: ${EDITABLE_DATASET_NAME_ENABLED:false} # Enables the ability to edit the dataset name in the UI

entityChangeEvents:
  enabled: "${ENABLE_ENTITY_CHANGE_EVENTS_HOOK:true}"
  # Empty by default.
  consumerGroupSuffix: "${ECE_CONSUMER_GROUP_SUFFIX:}"

views:
  # Resolved from VIEWS_ENABLED; on by default.
  enabled: "${VIEWS_ENABLED:true}"

entityClient:
  retryInterval: "${ENTITY_CLIENT_RETRY_INTERVAL:2}"
  numRetries: "${ENTITY_CLIENT_NUM_RETRIES:3}"
  java:
    get:
      # Matches the EbeanAspectDao batch size.
      batchSize: "${ENTITY_CLIENT_JAVA_GET_BATCH_SIZE:375}"
  restli:
    get:
      # Limited to prevent exceeding the restli URI size limit.
      batchSize: "${ENTITY_CLIENT_RESTLI_GET_BATCH_SIZE:100}"
      # Parallel threads.
      batchConcurrency: "${ENTITY_CLIENT_RESTLI_GET_BATCH_CONCURRENCY:2}"

usageClient:
  retryInterval: "${USAGE_CLIENT_RETRY_INTERVAL:2}"
  # Defaults to 0.
  numRetries: "${USAGE_CLIENT_NUM_RETRIES:0}"
  timeoutMs: "${USAGE_CLIENT_TIMEOUT_MS:3000}"

cache:
  primary:
    ttlSeconds: "${CACHE_TTL_SECONDS:600}"
    maxSize: "${CACHE_MAX_SIZE:10000}"
  homepage:
    entityCounts:
      ttlSeconds: "${CACHE_ENTITY_COUNTS_TTL_SECONDS:600}"
  search:
    lineage:
      # 1 day
      ttlSeconds: "${CACHE_SEARCH_LINEAGE_TTL_SECONDS:86400}"
      lightningThreshold: "${CACHE_SEARCH_LINEAGE_LIGHTNING_THRESHOLD:300}"
  client:
    usageClient:
      enabled: "${CACHE_CLIENT_USAGE_CLIENT_ENABLED:true}"
      statsEnabled: "${CACHE_CLIENT_USAGE_CLIENT_STATS_ENABLED:true}"
      statsIntervalSeconds: "${CACHE_CLIENT_USAGE_CLIENT_STATS_INTERVAL_SECONDS:120}"
      # 1 day
      defaultTTLSeconds: "${CACHE_CLIENT_USAGE_CLIENT_TTL_SECONDS:86400}"
      # 50MB
      maxBytes: "${CACHE_CLIENT_USAGE_CLIENT_MAX_BYTES:52428800}"
    entityClient:
      enabled: "${CACHE_CLIENT_ENTITY_CLIENT_ENABLED:true}"
      statsEnabled: "${CACHE_CLIENT_ENTITY_CLIENT_STATS_ENABLED:true}"
      statsIntervalSeconds: "${CACHE_CLIENT_ENTITY_CLIENT_STATS_INTERVAL_SECONDS:120}"
      # Entity/aspects are not cached by default.
      defaultTTLSeconds: "${CACHE_CLIENT_ENTITY_CLIENT_TTL_SECONDS:0}"
      # 100MB
      maxBytes: "${CACHE_CLIENT_ENTITY_CLIENT_MAX_BYTES:104857600}"
      # Per-entity, per-aspect TTLs in seconds (literal values, no env override).
      entityAspectTTLSeconds:
        # User aspects are cached for 20s.
        corpuser:
          corpUserKey: 20
          corpUserInfo: 20
          corpUserEditableInfo: 20
          corpUserStatus: 20
          globalTags: 20
          status: 20
          corpUserCredentials: 20
          corpUserSettings: 20
          roleMembership: 20
          groupMembership: 20
          nativeGroupMembership: 20
        structuredProperty:
          # 5 min
          status: 300
          # 5 min
          propertyDefinition: 300
          # 1 day
          structuredPropertyKey: 86400
        chart:
          # 6hrs
          usageFeatures: 21600
        dataset:
          # 6hrs
          usageFeatures: 21600
          # 6hrs
          storageFeatures: 21600
        dashboard:
          # 6hrs
          dashboardUsageStatistics: 21600

graphQL:
  concurrency:
    # Enable the separate thread pool; the remaining concurrency settings only
    # apply if this is enabled.
    separateThreadPool: "${GRAPHQL_CONCURRENCY_SEPARATE_THREAD_POOL:false}"
    # Default to JVM default of 256 KB.
    stackSize: "${GRAPHQL_CONCURRENCY_STACK_SIZE:256000}"
    # Base thread pool size for the GraphQL executor service, default 5 * # of cores.
    corePoolSize: "${GRAPHQL_CONCURRENCY_CORE_POOL_SIZE:-1}"
    # Maximum thread pool size for the GraphQL executor service, default 100 * # of cores.
    maxPoolSize: "${GRAPHQL_CONCURRENCY_MAX_POOL_SIZE:-1}"
    # Number of seconds to keep inactive threads alive.
    keepAlive: "${GRAPHQL_CONCURRENCY_KEEP_ALIVE:60}"
  query:
    complexityLimit: "${GRAPHQL_QUERY_COMPLEXITY_LIMIT:2000}"
    depthLimit: "${GRAPHQL_QUERY_DEPTH_LIMIT:50}"
    introspectionEnabled: "${GRAPHQL_QUERY_INTROSPECTION_ENABLED:true}"

# Dotted-key form of a springdoc property; it shares the `springdoc` prefix with
# the nested `springdoc.cache` mapping defined earlier in this file.
springdoc.api-docs.groups.enabled: true

forms:
  hook:
    # Must be a ${VAR:default} placeholder like every other flag in this file.
    # Written as `{ $VAR:true }`, YAML parses a flow mapping instead, so the
    # property never resolves to a boolean.
    enabled: ${FORMS_HOOK_ENABLED:true}
    consumerGroupSuffix: ${FORMS_HOOK_CONSUMER_GROUP_SUFFIX:}

businessAttribute:
  fetchRelatedEntitiesCount: "${BUSINESS_ATTRIBUTE_RELATED_ENTITIES_COUNT:20000}"
  fetchRelatedEntitiesBatchSize: "${BUSINESS_ATTRIBUTE_RELATED_ENTITIES_BATCH_SIZE:1000}"
  # Thread Pool size, default 2 * # of cores.
  threadCount: "${BUSINESS_ATTRIBUTE_PROPAGATION_CONCURRENCY_THREAD_COUNT:-1}"
  # Number of seconds to keep inactive threads alive.
  keepAliveTime: "${BUSINESS_ATTRIBUTE_PROPAGATION_CONCURRENCY_KEEP_ALIVE:60}"

metadataChangeProposal:
  validation:
    ignoreUnknown: "${MCP_VALIDATION_IGNORE_UNKNOWN:true}"
    extensions:
      enabled: "${MCP_VALIDATION_EXTENSIONS_ENABLED:false}"
  sideEffects:
    schemaField:
      enabled: "${MCP_SIDE_EFFECTS_SCHEMA_FIELD_ENABLED:false}"
    dataProductUnset:
      enabled: "${MCP_SIDE_EFFECTS_DATA_PRODUCT_UNSET_ENABLED:true}"
  throttle:
    updateIntervalMs: "${MCP_THROTTLE_UPDATE_INTERVAL_MS:60000}"

    # Which components are throttled
    components:
      mceConsumer:
        enabled: "${MCP_MCE_CONSUMER_THROTTLE_ENABLED:false}"
      apiRequests:
        enabled: "${MCP_API_REQUESTS_THROTTLE_ENABLED:false}"

    # How throttling is applied
    # Settings for the versioned MCL topic
    versioned:
      # Whether to monitor the MCL versioned backlog
      enabled: "${MCP_VERSIONED_THROTTLE_ENABLED:false}"
      # Throttle threshold
      threshold: "${MCP_VERSIONED_THRESHOLD:4000}"
      maxAttempts: "${MCP_VERSIONED_MAX_ATTEMPTS:1000}"
      initialIntervalMs: "${MCP_VERSIONED_INITIAL_INTERVAL_MS:100}"
      multiplier: "${MCP_VERSIONED_MULTIPLIER:10}"
      maxIntervalMs: "${MCP_VERSIONED_MAX_INTERVAL_MS:30000}"

    # Settings for the timeseries MCL topic
    timeseries:
      # Whether to monitor the MCL timeseries backlog
      enabled: "${MCP_TIMESERIES_THROTTLE_ENABLED:false}"
      # Throttle threshold
      threshold: "${MCP_TIMESERIES_THRESHOLD:4000}"
      maxAttempts: "${MCP_TIMESERIES_MAX_ATTEMPTS:1000}"
      initialIntervalMs: "${MCP_TIMESERIES_INITIAL_INTERVAL_MS:100}"
      multiplier: "${MCP_TIMESERIES_MULTIPLIER:10}"
      maxIntervalMs: "${MCP_TIMESERIES_MAX_INTERVAL_MS:30000}"