class datahub.metadata.schema_classes.AccessClass(roles=None)

Bases: _Aspect

Aspect used for associating roles to a dataset or any asset

Parameters:

roles (Optional[List[RoleAssociationClass]])

property roles: None | List[RoleAssociationClass]

List of roles that need to be associated

class datahub.metadata.schema_classes.AccessLevelClass

Bases: object

The various access levels

PRIVATE = 'PRIVATE'
PUBLIC = 'PUBLIC'

Private availability to a certain set of users

class datahub.metadata.schema_classes.ActorsClass(users=None)

Bases: _Aspect

Provisioned users of a role

Parameters:

users (Optional[List[RoleUserClass]])

property users: None | List[RoleUserClass]

List of provisioned users of a role

class datahub.metadata.schema_classes.ArrayTypeClass(nestedType=None)

Bases: DictWrapper

Array field type.

Parameters:

nestedType (Optional[List[str]])

property nestedType: None | List[str]

List of types this array holds.

class datahub.metadata.schema_classes.AspectBag(_typename, _fields=None, /, **kwargs)

Bases: dict

access: AccessClass
actors: ActorsClass
assertionInfo: AssertionInfoClass
assertionKey: AssertionKeyClass
assertionRunEvent: AssertionRunEventClass
browsePaths: BrowsePathsClass
browsePathsV2: BrowsePathsV2Class
chartInfo: ChartInfoClass
chartKey: ChartKeyClass
chartQuery: ChartQueryClass
chartUsageStatistics: ChartUsageStatisticsClass
container: ContainerClass
containerKey: ContainerKeyClass
containerProperties: ContainerPropertiesClass
corpGroupEditableInfo: CorpGroupEditableInfoClass
corpGroupInfo: CorpGroupInfoClass
corpGroupKey: CorpGroupKeyClass
corpUserCredentials: CorpUserCredentialsClass
corpUserEditableInfo: CorpUserEditableInfoClass
corpUserInfo: CorpUserInfoClass
corpUserKey: CorpUserKeyClass
corpUserSettings: CorpUserSettingsClass
corpUserStatus: CorpUserStatusClass
cost: CostClass
dashboardInfo: DashboardInfoClass
dashboardKey: DashboardKeyClass
dashboardUsageStatistics: DashboardUsageStatisticsClass
dataFlowInfo: DataFlowInfoClass
dataFlowKey: DataFlowKeyClass
dataHubAccessTokenInfo: DataHubAccessTokenInfoClass
dataHubAccessTokenKey: DataHubAccessTokenKeyClass
dataHubExecutionRequestInput: ExecutionRequestInputClass
dataHubExecutionRequestKey: ExecutionRequestKeyClass
dataHubExecutionRequestResult: ExecutionRequestResultClass
dataHubExecutionRequestSignal: ExecutionRequestSignalClass
dataHubIngestionSourceInfo: DataHubIngestionSourceInfoClass
dataHubIngestionSourceKey: DataHubIngestionSourceKeyClass
dataHubPolicyInfo: DataHubPolicyInfoClass
dataHubPolicyKey: DataHubPolicyKeyClass
dataHubRetentionConfig: DataHubRetentionConfigClass
dataHubRetentionKey: DataHubRetentionKeyClass
dataHubRoleInfo: DataHubRoleInfoClass
dataHubRoleKey: DataHubRoleKeyClass
dataHubSecretKey: DataHubSecretKeyClass
dataHubSecretValue: DataHubSecretValueClass
dataHubStepStateKey: DataHubStepStateKeyClass
dataHubStepStateProperties: DataHubStepStatePropertiesClass
dataHubUpgradeKey: DataHubUpgradeKeyClass
dataHubUpgradeRequest: DataHubUpgradeRequestClass
dataHubUpgradeResult: DataHubUpgradeResultClass
dataHubViewInfo: DataHubViewInfoClass
dataHubViewKey: DataHubViewKeyClass
dataJobInfo: DataJobInfoClass
dataJobInputOutput: DataJobInputOutputClass
dataJobKey: DataJobKeyClass
dataPlatformInfo: DataPlatformInfoClass
dataPlatformInstance: DataPlatformInstanceClass
dataPlatformInstanceKey: DataPlatformInstanceKeyClass
dataPlatformInstanceProperties: DataPlatformInstancePropertiesClass
dataPlatformKey: DataPlatformKeyClass
dataProcessInfo: DataProcessInfoClass
dataProcessInstanceInput: DataProcessInstanceInputClass
dataProcessInstanceKey: DataProcessInstanceKeyClass
dataProcessInstanceOutput: DataProcessInstanceOutputClass
dataProcessInstanceProperties: DataProcessInstancePropertiesClass
dataProcessInstanceRelationships: DataProcessInstanceRelationshipsClass
dataProcessInstanceRunEvent: DataProcessInstanceRunEventClass
dataProcessKey: DataProcessKeyClass
dataProductKey: DataProductKeyClass
dataProductProperties: DataProductPropertiesClass
datahubIngestionCheckpoint: DatahubIngestionCheckpointClass
datahubIngestionRunSummary: DatahubIngestionRunSummaryClass
datasetDeprecation: DatasetDeprecationClass
datasetKey: DatasetKeyClass
datasetProfile: DatasetProfileClass
datasetProperties: DatasetPropertiesClass
datasetUpstreamLineage: DatasetUpstreamLineageClass
datasetUsageStatistics: DatasetUsageStatisticsClass
deprecation: DeprecationClass
domainKey: DomainKeyClass
domainProperties: DomainPropertiesClass
domains: DomainsClass
editableChartProperties: EditableChartPropertiesClass
editableContainerProperties: EditableContainerPropertiesClass
editableDashboardProperties: EditableDashboardPropertiesClass
editableDataFlowProperties: EditableDataFlowPropertiesClass
editableDataJobProperties: EditableDataJobPropertiesClass
editableDatasetProperties: EditableDatasetPropertiesClass
editableMlFeatureProperties: EditableMLFeaturePropertiesClass
editableMlFeatureTableProperties: EditableMLFeatureTablePropertiesClass
editableMlModelGroupProperties: EditableMLModelGroupPropertiesClass
editableMlModelProperties: EditableMLModelPropertiesClass
editableMlPrimaryKeyProperties: EditableMLPrimaryKeyPropertiesClass
editableNotebookProperties: EditableNotebookPropertiesClass
editableSchemaMetadata: EditableSchemaMetadataClass
embed: EmbedClass
globalSettingsInfo: GlobalSettingsInfoClass
globalSettingsKey: GlobalSettingsKeyClass
globalTags: GlobalTagsClass
glossaryNodeInfo: GlossaryNodeInfoClass
glossaryNodeKey: GlossaryNodeKeyClass
glossaryRelatedTerms: GlossaryRelatedTermsClass
glossaryTermInfo: GlossaryTermInfoClass
glossaryTermKey: GlossaryTermKeyClass
glossaryTerms: GlossaryTermsClass
groupMembership: GroupMembershipClass
inputFields: InputFieldsClass
institutionalMemory: InstitutionalMemoryClass
intendedUse: IntendedUseClass
inviteToken: InviteTokenClass
inviteTokenKey: InviteTokenKeyClass
mlFeatureKey: MLFeatureKeyClass
mlFeatureProperties: MLFeaturePropertiesClass
mlFeatureTableKey: MLFeatureTableKeyClass
mlFeatureTableProperties: MLFeatureTablePropertiesClass
mlHyperParam: MLHyperParamClass
mlMetric: MLMetricClass
mlModelCaveatsAndRecommendations: CaveatsAndRecommendationsClass
mlModelDeploymentKey: MLModelDeploymentKeyClass
mlModelDeploymentProperties: MLModelDeploymentPropertiesClass
mlModelEthicalConsiderations: EthicalConsiderationsClass
mlModelEvaluationData: EvaluationDataClass
mlModelFactorPrompts: MLModelFactorPromptsClass
mlModelGroupKey: MLModelGroupKeyClass
mlModelGroupProperties: MLModelGroupPropertiesClass
mlModelKey: MLModelKeyClass
mlModelMetrics: MetricsClass
mlModelProperties: MLModelPropertiesClass
mlModelQuantitativeAnalyses: QuantitativeAnalysesClass
mlModelTrainingData: TrainingDataClass
mlPrimaryKeyKey: MLPrimaryKeyKeyClass
mlPrimaryKeyProperties: MLPrimaryKeyPropertiesClass
nativeGroupMembership: NativeGroupMembershipClass
notebookContent: NotebookContentClass
notebookInfo: NotebookInfoClass
notebookKey: NotebookKeyClass
operation: OperationClass
origin: OriginClass
ownership: OwnershipClass
ownershipTypeInfo: OwnershipTypeInfoClass
ownershipTypeKey: OwnershipTypeKeyClass
postInfo: PostInfoClass
postKey: PostKeyClass
queryKey: QueryKeyClass
queryProperties: QueryPropertiesClass
querySubjects: QuerySubjectsClass
roleKey: RoleKeyClass
roleMembership: RoleMembershipClass
roleProperties: RolePropertiesClass
schemaFieldKey: SchemaFieldKeyClass
schemaMetadata: SchemaMetadataClass
siblings: SiblingsClass
sourceCode: SourceCodeClass
status: StatusClass
subTypes: SubTypesClass
tagKey: TagKeyClass
tagProperties: TagPropertiesClass
telemetryClientId: TelemetryClientIdClass
telemetryKey: TelemetryKeyClass
testInfo: TestInfoClass
testKey: TestKeyClass
testResults: TestResultsClass
upstreamLineage: UpstreamLineageClass
versionInfo: VersionInfoClass
viewProperties: ViewPropertiesClass
class datahub.metadata.schema_classes.AssertionInfoClass(type, customProperties=None, externalUrl=None, datasetAssertion=None)

Bases: _Aspect

Information about an assertion

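Parameters:
  • type (Union[str, AssertionTypeClass])

  • customProperties (Optional[Dict[str, str]])

  • externalUrl (Optional[str])

  • datasetAssertion (Optional[DatasetAssertionInfoClass])

property customProperties: Dict[str, str]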

Custom property bag.

property datasetAssertion: None | DatasetAssertionInfoClass

Dataset Assertion information when type is DATASET

property externalUrl: None | str

URL where the reference exists

property type: str | AssertionTypeClass

Type of assertion. Assertion types can evolve to span Datasets, Flows (Pipelines), Models, Features etc.

class datahub.metadata.schema_classes.AssertionKeyClass(assertionId)

Bases: _Aspect

Key for an Assertion

Parameters:

assertionId (str)

property assertionId: str

Unique id for the assertion.

class datahub.metadata.schema_classes.AssertionResultClass(type, rowCount=None, missingCount=None, unexpectedCount=None, actualAggValue=None, nativeResults=None, externalUrl=None)

Bases: DictWrapper

The result of running an assertion

Parameters:
  • type (Union[str, AssertionResultTypeClass])

  • rowCount (Optional[int])

  • missingCount (Optional[int])

  • unexpectedCount (Optional[int])

  • actualAggValue (Optional[float])

  • nativeResults (Optional[Dict[str, str]])

  • externalUrl (Optional[str])

property actualAggValue: None | float

Observed aggregate value for evaluated batch

property externalUrl: None | str

URL where full results are available

property missingCount: None | int

Number of rows with missing value for evaluated batch

property nativeResults: None | Dict[str, str]

Other results of evaluation

property rowCount: None | int

Number of rows for evaluated batch

property type: str | AssertionResultTypeClass

The final result, e.g. either SUCCESS or FAILURE.

property unexpectedCount: None | int

Number of rows with unexpected value for evaluated batch

class datahub.metadata.schema_classes.AssertionResultTypeClass

Bases: object

The Assertion Succeeded

FAILURE = 'FAILURE'
SUCCESS = 'SUCCESS'

The Assertion Failed

class datahub.metadata.schema_classes.AssertionRunEventClass(timestampMillis, runId, assertionUrn, asserteeUrn, status, eventGranularity=None, partitionSpec=None, messageId=None, batchSpec=None, result=None, runtimeContext=None)

Bases: _Aspect

An event representing the current status of evaluating an assertion on a batch. AssertionRunEvent should be used for reporting the status of a run as an assertion evaluation progresses.

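Parameters:
  • timestampMillis (int)

  • runId (str)

  • assertionUrn (str)

  • asserteeUrn (str)

  • status (Union[str, AssertionRunStatusClass])

  • eventGranularity (Optional[TimeWindowSizeClass])

  • partitionSpec (Optional[PartitionSpecClass])

  • messageId (Optional[str])

  • batchSpec (Optional[BatchSpecClass])

  • result (Optional[AssertionResultClass])

  • runtimeContext (Optional[Dict[str, str]])

ASPECT_TYPE: ClassVar[str] = 'timeseries'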
property asserteeUrn: str
property assertionUrn: str
property batchSpec: None | BatchSpecClass

Specification of the batch which this run is evaluating

property eventGranularity: None | TimeWindowSizeClass

Granularity of the event if applicable

property messageId: None | str

The optional messageId, if provided serves as a custom user-defined unique identifier for an aspect value.

property partitionSpec: PartitionSpecClass | None

The optional partition specification.

property result: None | AssertionResultClass

Results of assertion, present if the status is COMPLETE

property runId: str

Native (platform-specific) identifier for this run

property runtimeContext: None | Dict[str, str]

Runtime parameters of evaluation

property status: str | AssertionRunStatusClass

The status of the assertion run as per this timeseries event.

property timestampMillis: int

The event timestamp field as epoch at UTC in milliseconds.

class datahub.metadata.schema_classes.AssertionRunStatusClass

Bases: object

The Assertion Run has completed

COMPLETE = 'COMPLETE'
class datahub.metadata.schema_classes.AssertionStdAggregationClass

Bases: object

The function that is applied to the aggregation input (schema, rows, column values) before evaluating an operator.

COLUMNS = 'COLUMNS'

Assertion is applied on number of columns.

COLUMN_COUNT = 'COLUMN_COUNT'

Assertion is applied on individual column value.

IDENTITY = 'IDENTITY'

Assertion is applied on column mean

MAX = 'MAX'

Assertion is applied on column sum

MEAN = 'MEAN'

Assertion is applied on column median

MEDIAN = 'MEDIAN'

Assertion is applied on number of distinct values in column

MIN = 'MIN'

Assertion is applied on column std deviation

NULL_COUNT = 'NULL_COUNT'

Assertion is applied on proportion of null values in column

NULL_PROPORTION = 'NULL_PROPORTION'

Assertion is applied on column std deviation

ROW_COUNT = 'ROW_COUNT'

Assertion is applied on all columns.

STDDEV = 'STDDEV'

Assertion is applied on column min

SUM = 'SUM'

Other

UNIQUE_COUNT = 'UNIQUE_COUNT'

Assertion is applied on proportion of distinct values in column

UNIQUE_PROPOTION = 'UNIQUE_PROPOTION'

Assertion is applied on number of null values in column

class datahub.metadata.schema_classes.AssertionStdOperatorClass

Bases: object

A boolean operator that is applied on the input to an assertion, after an aggregation function has been applied.

BETWEEN = 'BETWEEN'

Value being asserted is less than a max value. Requires ‘value’ parameter.

CONTAIN = 'CONTAIN'

Value being asserted ends with value. Requires ‘value’ parameter.

END_WITH = 'END_WITH'

Value being asserted starts with value. Requires ‘value’ parameter.

EQUAL_TO = 'EQUAL_TO'

Value being asserted is not null. Requires no parameters.

GREATER_THAN = 'GREATER_THAN'

Value being asserted is greater than or equal to some value. Requires ‘value’ parameter.

GREATER_THAN_OR_EQUAL_TO = 'GREATER_THAN_OR_EQUAL_TO'

Value being asserted is equal to value. Requires ‘value’ parameter.

IN = 'IN'

Value being asserted is not in one of the array values. Requires ‘value’ parameter.

LESS_THAN = 'LESS_THAN'

Value being asserted is less than or equal to some value. Requires ‘value’ parameter.

LESS_THAN_OR_EQUAL_TO = 'LESS_THAN_OR_EQUAL_TO'

Value being asserted is greater than some value. Requires ‘value’ parameter.

NOT_IN = 'NOT_IN'

Other

NOT_NULL = 'NOT_NULL'

Value being asserted contains value. Requires ‘value’ parameter.

REGEX_MATCH = 'REGEX_MATCH'

Value being asserted is one of the array values. Requires ‘value’ parameter.

START_WITH = 'START_WITH'

Value being asserted matches the regex value. Requires ‘value’ parameter.

class datahub.metadata.schema_classes.AssertionStdParameterClass(value, type)

Bases: DictWrapper

Single parameter for AssertionStdOperators.

Parameters:
  • value (str)

  • type (Union[str, AssertionStdParameterTypeClass])

property type: str | AssertionStdParameterTypeClass

The type of the parameter

property value: str

The parameter value

class datahub.metadata.schema_classes.AssertionStdParameterTypeClass

Bases: object

LIST = 'LIST'
NUMBER = 'NUMBER'
SET = 'SET'
STRING = 'STRING'
UNKNOWN = 'UNKNOWN'
class datahub.metadata.schema_classes.AssertionStdParametersClass(value=None, maxValue=None, minValue=None)

Bases: DictWrapper

Parameters for AssertionStdOperators.

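Parameters:
  • value (Optional[AssertionStdParameterClass])

  • maxValue (Optional[AssertionStdParameterClass])

  • minValue (Optional[AssertionStdParameterClass])

property maxValue: None | AssertionStdParameterClass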

The maxValue parameter of an assertion

property minValue: None | AssertionStdParameterClass

The minValue parameter of an assertion

property value: None | AssertionStdParameterClass

The value parameter of an assertion

class datahub.metadata.schema_classes.AssertionTypeClass

Bases: object

DATASET = 'DATASET'
class datahub.metadata.schema_classes.AuditStampClass(time, actor, impersonator=None, message=None)

Bases: DictWrapper

Data captured on a resource/association/sub-resource level giving insight into when that resource/association/sub-resource moved into a particular lifecycle stage, and who acted to move it into that specific lifecycle stage.

Parameters:
  • time (int)

  • actor (str)

  • impersonator (Optional[str])

  • message (Optional[str])

property actor: str

The entity (e.g. a member URN) which will be credited for moving the resource/association/sub-resource into the specific lifecycle stage. It is also the one used to authorize the change.

property impersonator: None | str

The entity (e.g. a service URN) which performs the change on behalf of the Actor and must be authorized to act as the Actor.

property message: None | str

Additional context around how DataHub was informed of the particular change. For example: was the change created by an automated process, or manually.

property time: int

When did the resource/association/sub-resource move into the specific lifecycle stage represented by this AuditEvent.

class datahub.metadata.schema_classes.AzkabanJobTypeClass

Bases: object

The various types of supported Azkaban jobs

COMMAND = 'COMMAND'

Runs a java program with ability to access Hadoop cluster. https://azkaban.readthedocs.io/en/latest/jobTypes.html#java-job-type

GLUE = 'GLUE'
HADOOP_JAVA = 'HADOOP_JAVA'

In large part, this is the same Command type. The difference is its ability to talk to a Hadoop cluster securely, via Hadoop tokens.

HADOOP_SHELL = 'HADOOP_SHELL'

Hive type is for running Hive jobs.

HIVE = 'HIVE'

Pig type is for running Pig jobs.

PIG = 'PIG'

SQL is for running Presto, mysql queries etc

SQL = 'SQL'

Glue type is for running AWS Glue job transforms.

class datahub.metadata.schema_classes.BaseDataClass(dataset, motivation=None, preProcessing=None)

Bases: DictWrapper

BaseData record

Parameters:
  • dataset (str)

  • motivation (Optional[str])

  • preProcessing (Optional[List[str]])

property dataset: str

What dataset was used in the MLModel?

property motivation: None | str

Why was this dataset chosen?

property preProcessing: None | List[str]

How was the data preprocessed (e.g., tokenization of sentences, cropping of images, any filtering such as dropping images without faces)?

class datahub.metadata.schema_classes.BatchSpecClass(customProperties=None, nativeBatchId=None, query=None, limit=None)

Bases: DictWrapper

A batch on which certain operations, e.g. data quality evaluation, is done.

Parameters:
  • customProperties (Optional[Dict[str, str]])

  • nativeBatchId (Optional[str])

  • query (Optional[str])

  • limit (Optional[int])

property customProperties: Dict[str, str]

Custom property bag.

property limit: None | int

Any limit to the number of rows in the batch, if applied

property nativeBatchId: None | str

The native identifier as specified by the system operating on the batch.

property query: None | str

A query that identifies a batch of data

class datahub.metadata.schema_classes.BinaryJsonSchemaClass(schema)

Bases: DictWrapper

Schema text of binary JSON schema.

Parameters:

schema (str)

property schema: str

The native schema text for binary JSON file format.

class datahub.metadata.schema_classes.BooleanTypeClass

Bases: DictWrapper

Boolean field type.

class datahub.metadata.schema_classes.BrowsePathEntryClass(id, urn=None)

Bases: DictWrapper

Represents a single level in an entity’s browsePathV2

Parameters:
  • id (str)

  • urn (Optional[str])

property id: str

The ID of the browse path entry. This is what gets stored in the index. If there’s an urn associated with this entry, id and urn will be the same

property urn: None | str

Optional urn pointing to some entity in DataHub

class datahub.metadata.schema_classes.BrowsePathsClass(paths)

Bases: _Aspect

Shared aspect containing Browse Paths to be indexed for an entity.

Parameters:

paths (List[str])

property paths: List[str]

A list of valid browse paths for the entity.

Browse paths are expected to be forward slash-separated strings. For example: ‘prod/snowflake/datasetName’

class datahub.metadata.schema_classes.BrowsePathsV2Class(path)

Bases: _Aspect

Shared aspect containing a Browse Path to be indexed for an entity.

Parameters:

path (List[BrowsePathEntryClass])

property path: List[BrowsePathEntryClass]

A valid browse path for the entity. This field is provided by DataHub by default. This aspect is a newer version of browsePaths where we can encode more information in the path. This path is also based on containers for a given entity if it has containers.

This is stored in Elasticsearch as unit-separator-delimited strings and only includes platform-specific folders or containers. These paths should not include high-level info captured elsewhere, i.e. Platform and Environment.

class datahub.metadata.schema_classes.BytesTypeClass

Bases: DictWrapper

Bytes field type.

class datahub.metadata.schema_classes.CalendarIntervalClass

Bases: object

DAY = 'DAY'
HOUR = 'HOUR'
MINUTE = 'MINUTE'
MONTH = 'MONTH'
QUARTER = 'QUARTER'
SECOND = 'SECOND'
WEEK = 'WEEK'
YEAR = 'YEAR'
class datahub.metadata.schema_classes.CaveatDetailsClass(needsFurtherTesting=None, caveatDescription=None, groupsNotRepresented=None)

Bases: DictWrapper

This section should list additional concerns that were not covered in the previous sections. For example, did the results suggest any further testing? Were there any relevant groups that were not represented in the evaluation dataset? Are there additional recommendations for model use?

Parameters:
  • needsFurtherTesting (Optional[bool])

  • caveatDescription (Optional[str])

  • groupsNotRepresented (Optional[List[str]])

property caveatDescription: None | str

Caveat description. For example: Given gender classes are binary (male/not male), which we include as male/female. Further work needed to evaluate across a spectrum of genders.

property groupsNotRepresented: None | List[str]

Relevant groups that were not represented in the evaluation dataset?

property needsFurtherTesting: None | bool

Did the results suggest any further testing?

class datahub.metadata.schema_classes.CaveatsAndRecommendationsClass(caveats=None, recommendations=None, idealDatasetCharacteristics=None)

Bases: _Aspect

This section should list additional concerns that were not covered in the previous sections. For example, did the results suggest any further testing? Were there any relevant groups that were not represented in the evaluation dataset? Are there additional recommendations for model use?

Parameters:
  • caveats (Optional[CaveatDetailsClass])

  • recommendations (Optional[str])

  • idealDatasetCharacteristics (Optional[List[str]])

property caveats: None | CaveatDetailsClass

This section should list additional concerns that were not covered in the previous sections. For example, did the results suggest any further testing? Were there any relevant groups that were not represented in the evaluation dataset?

property idealDatasetCharacteristics: None | List[str]

Ideal characteristics of an evaluation dataset for this MLModel

property recommendations: None | str

Recommendations on where this MLModel should be used.

class datahub.metadata.schema_classes.ChangeAuditStampsClass(created=None, lastModified=None, deleted=None)

Bases: DictWrapper

Data captured on a resource/association/sub-resource level giving insight into when that resource/association/sub-resource moved into various lifecycle stages, and who acted to move it into those lifecycle stages. The recommended best practice is to include this record in your record schema, and annotate its fields as @readOnly in your resource. See linkedin/rest.li

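Parameters:
  • created (Optional[AuditStampClass])

  • lastModified (Optional[AuditStampClass])

  • deleted (Optional[AuditStampClass])

property created: AuditStampClass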

An AuditStamp corresponding to the creation of this resource/association/sub-resource. A value of 0 for time indicates missing data.

property deleted: None | AuditStampClass

An AuditStamp corresponding to the deletion of this resource/association/sub-resource. Logically, deleted MUST have a later timestamp than creation. It may or may not have the same time as lastModified depending upon the resource/association/sub-resource semantics.

property lastModified: AuditStampClass

An AuditStamp corresponding to the last modification of this resource/association/sub-resource. If no modification has happened since creation, lastModified should be the same as created. A value of 0 for time indicates missing data.

class datahub.metadata.schema_classes.ChangeTypeClass

Bases: object

Descriptor for a change action

CREATE = 'CREATE'

NOT SUPPORTED YET: update if exists, otherwise fail.

DELETE = 'DELETE'

NOT SUPPORTED YET: patch the changes instead of a full replace.

PATCH = 'PATCH'

Restate an aspect, e.g. in an index refresh.

RESTATE = 'RESTATE'
UPDATE = 'UPDATE'

NOT SUPPORTED YET: delete action.

UPSERT = 'UPSERT'

NOT SUPPORTED YET: insert if not exists, otherwise fail.

class datahub.metadata.schema_classes.ChartCellClass(cellId, changeAuditStamps, cellTitle=None)

Bases: DictWrapper

Chart cell in a notebook, which will present content in chart format

Parameters:
  • cellId (str)

  • changeAuditStamps (ChangeAuditStampsClass)

  • cellTitle (Optional[str])

property cellId: str

Unique id for the cell. This id should be globally unique for a Notebook tool even when there are multiple deployments of it. As an example, Notebook URL could be used here for QueryBook such as ‘querybook.com/notebook/773/?cellId=1234’

property cellTitle: None | str

Title of the cell

property changeAuditStamps: ChangeAuditStampsClass

Captures information about who created/last modified/deleted this Notebook cell and when

class datahub.metadata.schema_classes.ChartInfoClass(title, description, lastModified, customProperties=None, externalUrl=None, chartUrl=None, inputs=None, inputEdges=None, type=None, access=None, lastRefreshed=None)

Bases: _Aspect

Information about a chart

Parameters:
  • title (str)

  • description (str)

  • lastModified (ChangeAuditStampsClass)

  • customProperties (Optional[Dict[str, str]])

  • externalUrl (Optional[str])

  • chartUrl (Optional[str])

  • inputs (Optional[List[str]])

  • inputEdges (Optional[List[EdgeClass]])

  • type (Union[None, str, ChartTypeClass])

  • access (Union[None, str, AccessLevelClass])

  • lastRefreshed (Optional[int])

property access: None | str | AccessLevelClass

Access level for the chart

property chartUrl: None | str

URL for the chart. This could be used as an external link on DataHub to allow users to access/view the chart

property customProperties: Dict[str, str]

Custom property bag.

property description: str

Detailed description about the chart

property externalUrl: None | str

URL where the reference exists

property inputEdges: None | List[EdgeClass]

Data sources for the chart

property inputs: None | List[str]

Data sources for the chart. Deprecated! Use inputEdges instead.

property lastModified: ChangeAuditStampsClass

Captures information about who created/last modified/deleted this chart and when

property lastRefreshed: None | int

The time when this chart last refreshed

property title: str

Title of the chart

property type: None | str | ChartTypeClass

Type of the chart

class datahub.metadata.schema_classes.ChartKeyClass(dashboardTool, chartId)

Bases: _Aspect

Key for a Chart

Parameters:
  • dashboardTool (str)

  • chartId (str)

property chartId: str

Unique id for the chart. This id should be globally unique for a dashboarding tool even when there are multiple deployments of it. As an example, chart URL could be used here for Looker such as ‘looker.linkedin.com/looks/1234’

property dashboardTool: str

The name of the dashboard tool such as looker, redash etc.

class datahub.metadata.schema_classes.ChartQueryClass(rawQuery, type)

Bases: _Aspect

Information for chart query which is used for getting data of the chart

Parameters:
  • rawQuery (str)

  • type (Union[str, ChartQueryTypeClass])

property rawQuery: str

Raw query to build a chart from input datasets

property type: str | ChartQueryTypeClass

Chart query type

class datahub.metadata.schema_classes.ChartQueryTypeClass

Bases: object

LookML queries

LOOKML = 'LOOKML'

SQL type queries

SQL = 'SQL'
class datahub.metadata.schema_classes.ChartSnapshotClass(urn, aspects)

Bases: DictWrapper

A metadata snapshot for a specific Chart entity.

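Parameters:
  • urn (str)

  • aspects (List[Union[ChartKeyClass, ChartInfoClass, ChartQueryClass, EditableChartPropertiesClass, OwnershipClass, StatusClass, GlobalTagsClass, BrowsePathsClass, GlossaryTermsClass, InstitutionalMemoryClass, DataPlatformInstanceClass, BrowsePathsV2Class]])

property aspects: List[ChartKeyClass | ChartInfoClass | ChartQueryClass | EditableChartPropertiesClass | OwnershipClass | StatusClass | GlobalTagsClass | BrowsePathsClass | GlossaryTermsClass | InstitutionalMemoryClass | DataPlatformInstanceClass | BrowsePathsV2Class]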

The list of metadata aspects associated with the chart. Depending on the use case, this can either be all, or a selection, of supported aspects.

property urn: str

URN for the entity the metadata snapshot is associated with.

class datahub.metadata.schema_classes.ChartTypeClass

Bases: object

The various types of charts

AREA = 'AREA'
BAR = 'BAR'

Chart showing a Pie chart

BOX_PLOT = 'BOX_PLOT'
COHORT = 'COHORT'
HISTOGRAM = 'HISTOGRAM'
LINE = 'LINE'
PIE = 'PIE'

Chart showing a Scatter plot

SCATTER = 'SCATTER'

Chart showing a table

TABLE = 'TABLE'

Chart showing Markdown formatted text

TEXT = 'TEXT'
WORD_CLOUD = 'WORD_CLOUD'
class datahub.metadata.schema_classes.ChartUsageStatisticsClass(timestampMillis, eventGranularity=None, partitionSpec=None, messageId=None, viewsCount=None, uniqueUserCount=None, userCounts=None)

Bases: _Aspect

Experimental (Subject to breaking change) – Stats corresponding to chart’s usage.

If this aspect represents the latest snapshot of the statistics about a Chart, the eventGranularity field should be null. If this aspect represents a bucketed window of usage statistics (e.g. over a day), then the eventGranularity field should be set accordingly.

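Parameters:
  • timestampMillis (int)

  • eventGranularity (Optional[TimeWindowSizeClass])

  • partitionSpec (Optional[PartitionSpecClass])

  • messageId (Optional[str])

  • viewsCount (Optional[int])

  • uniqueUserCount (Optional[int])

  • userCounts (Optional[List[ChartUserUsageCountsClass]])

ASPECT_TYPE: ClassVar[str] = 'timeseries'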
property eventGranularity: None | TimeWindowSizeClass

Granularity of the event if applicable

property messageId: None | str

The optional messageId, if provided serves as a custom user-defined unique identifier for an aspect value.

property partitionSpec: PartitionSpecClass | None

The optional partition specification.

property timestampMillis: int

The event timestamp field as epoch at UTC in milliseconds.

property uniqueUserCount: None | int

Unique user count

property userCounts: None | List[ChartUserUsageCountsClass]

Users within this bucket, with frequency counts

property viewsCount: None | int

The total number of times chart has been viewed

class datahub.metadata.schema_classes.ChartUserUsageCountsClass(user, viewsCount=None)

Bases: DictWrapper

Records a single user’s usage counts for a given resource

Parameters:
  • user (str)

  • viewsCount (Optional[int])

property user: str

The unique id of the user.

property viewsCount: None | int

The number of times the user has viewed the chart

class datahub.metadata.schema_classes.ConditionClass

Bases: object

The matching condition in a filter criterion

CONTAIN = 'CONTAIN'

Represent the relation: String field ends with value, e.g. name ends with Event

END_WITH = 'END_WITH'

Represent the relation: field = value, e.g. platform = hdfs

EQUAL = 'EQUAL'

Represent the relation: field is null, e.g. platform is null

EXISTS = 'EXISTS'

Represent the relation greater than, e.g. ownerCount > 5

GREATER_THAN = 'GREATER_THAN'

Represent the relation greater than or equal to, e.g. ownerCount >= 5

GREATER_THAN_OR_EQUAL_TO = 'GREATER_THAN_OR_EQUAL_TO'

Represent the relation: String field is one of the array values, e.g. name in [“Profile”, “Event”]

IN = 'IN'

Represent the relation less than, e.g. ownerCount < 3

IS_NULL = 'IS_NULL'

Represents the relation: field exists and is non-empty, e.g. owners is not null and != [] (empty)

LESS_THAN = 'LESS_THAN'

Represent the relation less than or equal to, e.g. ownerCount <= 3

LESS_THAN_OR_EQUAL_TO = 'LESS_THAN_OR_EQUAL_TO'

Represent the relation: String field starts with value, e.g. name starts with PageView

START_WITH = 'START_WITH'
class datahub.metadata.schema_classes.ConjunctiveCriterionClass(and_)

Bases: DictWrapper

A list of criteria and’d together.

Parameters:

and_ (List[CriterionClass])

property and_: List[CriterionClass]

A list of and criteria the filter applies to the query

class datahub.metadata.schema_classes.ContainerClass(container)

Bases: _Aspect

Link from an asset to its parent container

Parameters:

container (str)

property container: str

The parent container of an asset

class datahub.metadata.schema_classes.ContainerKeyClass(guid=None)

Bases: _Aspect

Key for an Asset Container

Parameters:

guid (Optional[str])

property guid: None | str

Unique guid for container

class datahub.metadata.schema_classes.ContainerPropertiesClass(name, customProperties=None, externalUrl=None, qualifiedName=None, description=None, created=None, lastModified=None)

Bases: _Aspect

Information about an Asset Container as received from a 3rd party source system

Parameters:
  • name (str)

  • customProperties (Optional[Dict[str, str]])

  • externalUrl (Optional[str])

  • qualifiedName (Optional[str])

  • description (Optional[str])

  • created (Optional[TimeStampClass])

  • lastModified (Optional[TimeStampClass])

property created: None | TimeStampClass

A timestamp documenting when the asset was created in the source Data Platform (not on DataHub)

property customProperties: Dict[str, str]

Custom property bag.

property description: None | str

Description of the Asset Container as it exists inside a source system

property externalUrl: None | str

URL where the reference exists

property lastModified: None | TimeStampClass

A timestamp documenting when the asset was last modified in the source Data Platform (not on DataHub)

property name: str

Display name of the Asset Container

property qualifiedName: None | str

Fully-qualified name of the Container

class datahub.metadata.schema_classes.CorpGroupEditableInfoClass(description=None, pictureLink=None, slack=None, email=None)

Bases: _Aspect

Group information that can be edited from UI

Parameters:
  • description (Optional[str])

  • pictureLink (Optional[str])

  • slack (Optional[str])

  • email (Optional[str])

property description: None | str

A description of the group

property email: None | str

Email address to contact the group

property pictureLink: str

A URL which points to a picture which the user wants to set as the photo for the group

property slack: None | str

Slack channel for the group

class datahub.metadata.schema_classes.CorpGroupInfoClass(admins, members, groups, displayName=None, email=None, description=None, slack=None, created=None)

Bases: _Aspect

Information about a Corp Group ingested from a third party source

Parameters:
  • admins (List[str])

  • members (List[str])

  • groups (List[str])

  • displayName (Optional[str])

  • email (Optional[str])

  • description (Optional[str])

  • slack (Optional[str])

  • created (Optional[AuditStampClass])

property admins: List[str]

Owners of this group. Deprecated! Replaced by Ownership aspect.

property created: None | AuditStampClass

Created Audit stamp

property description: None | str

A description of the group.

property displayName: None | str

The name of the group.

property email: None | str

email of this group

property groups: List[str]

List of groups in this group. Deprecated! This field is unused.

property members: List[str]

List of ldap urn in this group. Deprecated! Replaced by GroupMembership aspect.

property slack: None | str

Slack channel for the group

class datahub.metadata.schema_classes.CorpGroupKeyClass(name)

Bases: _Aspect

Key for a CorpGroup

Parameters:

name (str)

property name: str

The URL-encoded name of the AD/LDAP group. Serves as a globally unique identifier within DataHub.

class datahub.metadata.schema_classes.CorpGroupSnapshotClass(urn, aspects)

Bases: DictWrapper

A metadata snapshot for a specific CorpGroup entity.

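Parameters:
  • urn (str)

  • aspects (List[Union[CorpGroupKeyClass, CorpGroupInfoClass, GlobalTagsClass, StatusClass]])

property aspects: List[CorpGroupKeyClass | CorpGroupInfoClass | GlobalTagsClass | StatusClass]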

The list of metadata aspects associated with the CorpGroup. Depending on the use case, this can either be all, or a selection, of supported aspects.

property urn: str

URN for the entity the metadata snapshot is associated with.

class datahub.metadata.schema_classes.CorpUserAppearanceSettingsClass(showSimplifiedHomepage=None)

Bases: DictWrapper

Settings for a user around the appearance of their DataHub UI

Parameters:

showSimplifiedHomepage (Optional[bool])

property showSimplifiedHomepage: None | bool

Flag whether the user should see a homepage with only datasets, charts and dashboards. Intended for users who have less operational use cases for the datahub tool.

class datahub.metadata.schema_classes.CorpUserCredentialsClass(salt, hashedPassword, passwordResetToken=None, passwordResetTokenExpirationTimeMillis=None)

Bases: _Aspect

Corp user credentials

Parameters:
  • salt (str)

  • hashedPassword (str)

  • passwordResetToken (Optional[str])

  • passwordResetTokenExpirationTimeMillis (Optional[int])

property hashedPassword: str

Hashed password generated by concatenating salt and password, then hashing

property passwordResetToken: None | str

Optional token needed to reset a user’s password. Can only be set by the admin.

property passwordResetTokenExpirationTimeMillis: None | int

When the password reset token expires.

property salt: str

Salt used to hash password

class datahub.metadata.schema_classes.CorpUserEditableInfoClass(aboutMe=None, teams=None, skills=None, pictureLink=None, displayName=None, title=None, slack=None, phone=None, email=None)

Bases: _Aspect

Linkedin corp user information that can be edited from UI

Parameters:
  • aboutMe (Optional[str])

  • teams (Optional[List[str]])

  • skills (Optional[List[str]])

  • pictureLink (Optional[str])

  • displayName (Optional[str])

  • title (Optional[str])

  • slack (Optional[str])

  • phone (Optional[str])

  • email (Optional[str])

property aboutMe: None | str

About me section of the user

property displayName: None | str

DataHub-native display name

property email: None | str

Email address to contact the user

property phone: None | str

Phone number to contact the user

property pictureLink: str

A URL which points to a picture which the user wants to set as a profile photo

property skills: List[str]

Skills that the user possesses e.g. Machine Learning

property slack: None | str

Slack handle for the user

property teams: List[str]

Teams that the user belongs to e.g. Metadata

property title: None | str

DataHub-native Title, e.g. ‘Software Engineer’

class datahub.metadata.schema_classes.CorpUserInfoClass(active, customProperties=None, displayName=None, email=None, title=None, managerUrn=None, departmentId=None, departmentName=None, firstName=None, lastName=None, fullName=None, countryCode=None)

Bases: _Aspect

Linkedin corp user information

Parameters:
  • active (bool)

  • customProperties (Optional[Dict[str, str]])

  • displayName (Optional[str])

  • email (Optional[str])

  • title (Optional[str])

  • managerUrn (Optional[str])

  • departmentId (Optional[int])

  • departmentName (Optional[str])

  • firstName (Optional[str])

  • lastName (Optional[str])

  • fullName (Optional[str])

  • countryCode (Optional[str])

property active: bool

Deprecated! Use CorpUserStatus instead. Whether the corpUser is active, ref: https://iwww.corp.linkedin.com/wiki/cf/display/GTSD/Accessing+Active+Directory+via+LDAP+tools

property countryCode: None | str

two uppercase letters country code. e.g. US

property customProperties: Dict[str, str]

Custom property bag.

property departmentId: None | int

department id this user belongs to

property departmentName: None | str

department name this user belongs to

property displayName: None | str

displayName of this user, e.g. Hang Zhang(DataHQ)

property email: None | str

email address of this user

property firstName: None | str

first name of this user

property fullName: None | str

Common name of this user, format is firstName + lastName (split by a whitespace)

property lastName: None | str

last name of this user

property managerUrn: None | str

direct manager of this user

property title: None | str

title of this user

class datahub.metadata.schema_classes.CorpUserKeyClass(username)

Bases: _Aspect

Key for a CorpUser

Parameters:

username (str)

property username: str

The name of the AD/LDAP user.

class datahub.metadata.schema_classes.CorpUserSettingsClass(appearance, views=None)

Bases: _Aspect

Settings that a user can customize through the datahub ui

Parameters:
  • appearance (CorpUserAppearanceSettingsClass)

  • views (Optional[CorpUserViewsSettingsClass])

property appearance: CorpUserAppearanceSettingsClass

Settings for a user around the appearance of their DataHub UI

property views: None | CorpUserViewsSettingsClass

User preferences for the Views feature.

class datahub.metadata.schema_classes.CorpUserSnapshotClass(urn, aspects)

Bases: DictWrapper

A metadata snapshot for a specific CorpUser entity.

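Parameters:
  • urn (str)

  • aspects (List[Union[CorpUserKeyClass, CorpUserInfoClass, CorpUserEditableInfoClass, CorpUserStatusClass, GroupMembershipClass, GlobalTagsClass, StatusClass]])

property aspects: List[CorpUserKeyClass | CorpUserInfoClass | CorpUserEditableInfoClass | CorpUserStatusClass | GroupMembershipClass | GlobalTagsClass | StatusClass]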

The list of metadata aspects associated with the CorpUser. Depending on the use case, this can either be all, or a selection, of supported aspects.

property urn: str

URN for the entity the metadata snapshot is associated with.

class datahub.metadata.schema_classes.CorpUserStatusClass(status, lastModified)

Bases: _Aspect

The status of the user, e.g. provisioned, active, suspended, etc.

Parameters:
  • status (str)

  • lastModified (AuditStampClass)

property lastModified: AuditStampClass

Audit stamp containing who last modified the status and when.

property status: str

Status of the user, e.g. PROVISIONED / ACTIVE / SUSPENDED

class datahub.metadata.schema_classes.CorpUserViewsSettingsClass(defaultView=None)

Bases: DictWrapper

Settings related to the ‘Views’ feature.

Parameters:

defaultView (Optional[str])

property defaultView: None | str

The default View which is selected for the user. If none is chosen, then this value will be left blank.

class datahub.metadata.schema_classes.CostClass(costType, cost)

Bases: _Aspect

Parameters:
  • costType (Union[str, CostTypeClass])

  • cost (CostCostClass)

property cost: CostCostClass
property costType: str | CostTypeClass
class datahub.metadata.schema_classes.CostCostClass(fieldDiscriminator, costId=None, costCode=None)

Bases: DictWrapper

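Parameters:
  • fieldDiscriminator (Union[str, CostCostDiscriminatorClass])

  • costId (Optional[float])

  • costCode (Optional[str])

property costCode: None | str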
property costId: None | float
property fieldDiscriminator: str | CostCostDiscriminatorClass

Contains the name of the field that has its value set.

class datahub.metadata.schema_classes.CostCostDiscriminatorClass

Bases: object

costCode = 'costCode'
costId = 'costId'
class datahub.metadata.schema_classes.CostTypeClass

Bases: object

Type of Cost Code

ORG_COST_TYPE = 'ORG_COST_TYPE'
class datahub.metadata.schema_classes.CriterionClass(field, value, values=None, condition=None, negated=None)

Bases: DictWrapper

A criterion for matching a field with given value

Parameters:
  • field (str)

  • value (str)

  • values (Optional[List[str]])

  • condition (Union[str, ConditionClass, None])

  • negated (Optional[bool])

property condition: str | ConditionClass

The condition for the criterion, e.g. EQUAL, START_WITH

property field: str

The name of the field that the criterion refers to

property negated: bool

Whether the condition should be negated

property value: str

The value of the intended field

property values: List[str]

Values, one of which the intended field should match. Note: if values is set, the above “value” field will be ignored.

class datahub.metadata.schema_classes.DashboardInfoClass(title, description, lastModified, customProperties=None, externalUrl=None, charts=None, chartEdges=None, datasets=None, datasetEdges=None, dashboardUrl=None, access=None, lastRefreshed=None)

Bases: _Aspect

Information about a dashboard

Parameters:
  • title (str)

  • description (str)

  • lastModified (ChangeAuditStampsClass)

  • customProperties (Optional[Dict[str, str]])

  • externalUrl (Optional[str])

  • charts (Optional[List[str]])

  • chartEdges (Optional[List[EdgeClass]])

  • datasets (Optional[List[str]])

  • datasetEdges (Optional[List[EdgeClass]])

  • dashboardUrl (Optional[str])

  • access (Union[None, str, AccessLevelClass])

  • lastRefreshed (Optional[int])

property access: None | str | AccessLevelClass

Access level for the dashboard

property chartEdges: None | List[EdgeClass]

Charts in a dashboard

property charts: List[str]

Charts in a dashboard. Deprecated! Use chartEdges instead.

property customProperties: Dict[str, str]

Custom property bag.

property dashboardUrl: None | str

URL for the dashboard. This could be used as an external link on DataHub to allow users access/view the dashboard

property datasetEdges: None | List[EdgeClass]

Datasets consumed by a dashboard

property datasets: List[str]

Datasets consumed by a dashboard. Deprecated! Use datasetEdges instead.

property description: str

Detailed description about the dashboard

property externalUrl: None | str

URL where the reference exists

property lastModified: ChangeAuditStampsClass

Captures information about who created/last modified/deleted this dashboard and when

property lastRefreshed: None | int

The time when this dashboard last refreshed

property title: str

Title of the dashboard

class datahub.metadata.schema_classes.DashboardKeyClass(dashboardTool, dashboardId)

Bases: _Aspect

Key for a Dashboard

Parameters:
  • dashboardTool (str)

  • dashboardId (str)

property dashboardId: str

Unique id for the dashboard. This id should be globally unique for a dashboarding tool even when there are multiple deployments of it. As an example, dashboard URL could be used here for Looker such as ‘looker.linkedin.com/dashboards/1234’

property dashboardTool: str

The name of the dashboard tool such as looker, redash etc.

class datahub.metadata.schema_classes.DashboardSnapshotClass(urn, aspects)

Bases: DictWrapper

A metadata snapshot for a specific Dashboard entity.

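Parameters:
  • urn (str)

  • aspects (List[Union[DashboardKeyClass, DashboardInfoClass, EditableDashboardPropertiesClass, OwnershipClass, StatusClass, GlobalTagsClass, BrowsePathsClass, GlossaryTermsClass, InstitutionalMemoryClass, DataPlatformInstanceClass, BrowsePathsV2Class]])

property aspects: List[DashboardKeyClass | DashboardInfoClass | EditableDashboardPropertiesClass | OwnershipClass | StatusClass | GlobalTagsClass | BrowsePathsClass | GlossaryTermsClass | InstitutionalMemoryClass | DataPlatformInstanceClass | BrowsePathsV2Class]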

The list of metadata aspects associated with the dashboard. Depending on the use case, this can either be all, or a selection, of supported aspects.

property urn: str

URN for the entity the metadata snapshot is associated with.

class datahub.metadata.schema_classes.DashboardUsageStatisticsClass(timestampMillis, eventGranularity=None, partitionSpec=None, messageId=None, viewsCount=None, executionsCount=None, uniqueUserCount=None, userCounts=None, favoritesCount=None, lastViewedAt=None)

Bases: _Aspect

Experimental (Subject to breaking change) – Stats corresponding to dashboard’s usage.

If this aspect represents the latest snapshot of the statistics about a Dashboard, the eventGranularity field should be null. If this aspect represents a bucketed window of usage statistics (e.g. over a day), then the eventGranularity field should be set accordingly.

Parameters:
  • timestampMillis (int)

  • eventGranularity (Optional[TimeWindowSizeClass])

  • partitionSpec (Optional[PartitionSpecClass])

  • messageId (Optional[str])

  • viewsCount (Optional[int])

  • executionsCount (Optional[int])

  • uniqueUserCount (Optional[int])

  • userCounts (Optional[List[DashboardUserUsageCountsClass]])

  • favoritesCount (Optional[int])

  • lastViewedAt (Optional[int])

ASPECT_TYPE: ClassVar[str] = 'timeseries'
property eventGranularity: None | TimeWindowSizeClass

Granularity of the event if applicable

property executionsCount: None | int

The total number of dashboard executions (refreshes / syncs)

property favoritesCount: None | int

The total number of times that the dashboard has been favorited

property lastViewedAt: None | int

Last viewed at

This should not be set in cases where statistics are windowed.

property messageId: None | str

The optional messageId, if provided serves as a custom user-defined unique identifier for an aspect value.

property partitionSpec: PartitionSpecClass | None

The optional partition specification.

property timestampMillis: int

The event timestamp field as epoch at UTC in milliseconds.

property uniqueUserCount: None | int

Unique user count

property userCounts: None | List[DashboardUserUsageCountsClass]

Users within this bucket, with frequency counts

property viewsCount: None | int

The total number of times dashboard has been viewed

class datahub.metadata.schema_classes.DashboardUserUsageCountsClass(user, viewsCount=None, executionsCount=None, usageCount=None, userEmail=None)

Bases: DictWrapper

Records a single user’s usage counts for a given resource

Parameters:
  • user (str)

  • viewsCount (Optional[int])

  • executionsCount (Optional[int])

  • usageCount (Optional[int])

  • userEmail (Optional[str])

property executionsCount: None | int

The number of times the user has executed (refreshed) the dashboard

property usageCount: None | int

Normalized numeric metric representing user’s dashboard usage – the number of times the user executed or viewed the dashboard.

property user: str

The unique id of the user.

property userEmail: None | str

If user_email is set, we attempt to resolve the user’s urn upon ingest

property viewsCount: None | int

The number of times the user has viewed the dashboard

class datahub.metadata.schema_classes.DataFlowInfoClass(name, customProperties=None, externalUrl=None, description=None, project=None, created=None, lastModified=None)

Bases: _Aspect

Information about a Data processing flow

Parameters:
  • name (str)

  • customProperties (Optional[Dict[str, str]])

  • externalUrl (Optional[str])

  • description (Optional[str])

  • project (Optional[str])

  • created (Optional[TimeStampClass])

  • lastModified (Optional[TimeStampClass])

property created: None | TimeStampClass

A timestamp documenting when the asset was created in the source Data Platform (not on DataHub)

property customProperties: Dict[str, str]

Custom property bag.

property description: None | str

Flow description

property externalUrl: None | str

URL where the reference exists

property lastModified: None | TimeStampClass

A timestamp documenting when the asset was last modified in the source Data Platform (not on DataHub)

property name: str

Flow name

property project: None | str

Optional project/namespace associated with the flow

class datahub.metadata.schema_classes.DataFlowKeyClass(orchestrator, flowId, cluster)

Bases: _Aspect

Key for a Data Flow

Parameters:
  • orchestrator (str)

  • flowId (str)

  • cluster (str)

property cluster: str

Cluster where the flow is executed

property flowId: str

Unique Identifier of the data flow

property orchestrator: str

Workflow manager like azkaban, airflow which orchestrates the flow

class datahub.metadata.schema_classes.DataFlowSnapshotClass(urn, aspects)

Bases: DictWrapper

A metadata snapshot for a specific DataFlow entity.

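Parameters:
  • urn (str)

  • aspects (List[Union[DataFlowKeyClass, DataFlowInfoClass, EditableDataFlowPropertiesClass, OwnershipClass, StatusClass, GlobalTagsClass, BrowsePathsClass, GlossaryTermsClass, InstitutionalMemoryClass, DataPlatformInstanceClass, BrowsePathsV2Class]])

property aspects: List[DataFlowKeyClass | DataFlowInfoClass | EditableDataFlowPropertiesClass | OwnershipClass | StatusClass | GlobalTagsClass | BrowsePathsClass | GlossaryTermsClass | InstitutionalMemoryClass | DataPlatformInstanceClass | BrowsePathsV2Class]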

The list of metadata aspects associated with the data flow. Depending on the use case, this can either be all, or a selection, of supported aspects.

property urn: str

URN for the entity the metadata snapshot is associated with.

class datahub.metadata.schema_classes.DataHubAccessTokenInfoClass(name, actorUrn, ownerUrn, createdAt, expiresAt=None, description=None)

Bases: _Aspect

Information about a DataHub Access Token

Parameters:
  • name (str)

  • actorUrn (str)

  • ownerUrn (str)

  • createdAt (int)

  • expiresAt (Optional[int])

  • description (Optional[str])

property actorUrn: str

Urn of the actor to which this access token belongs.

property createdAt: int

When the token was created.

property description: None | str

Description of the token if defined.

property expiresAt: None | int

When the token expires.

property name: str

User defined name for the access token if defined.

property ownerUrn: str

Urn of the actor which created this access token.

class datahub.metadata.schema_classes.DataHubAccessTokenKeyClass(id)

Bases: _Aspect

Key for a DataHub Access Token

Parameters:

id (str)

property id: str

Access token’s SHA-256 hashed JWT signature

class datahub.metadata.schema_classes.DataHubActorFilterClass(users=None, groups=None, resourceOwners=None, resourceOwnersTypes=None, allUsers=None, allGroups=None, roles=None)

Bases: DictWrapper

Information used to filter DataHub actors.

Parameters:
  • users (Optional[List[str]])

  • groups (Optional[List[str]])

  • resourceOwners (Optional[bool])

  • resourceOwnersTypes (Optional[List[str]])

  • allUsers (Optional[bool])

  • allGroups (Optional[bool])

  • roles (Optional[List[str]])

property allGroups: bool

Whether the filter should apply to all groups.

property allUsers: bool

Whether the filter should apply to all users.

property groups: None | List[str]

A specific set of groups to apply the policy to (disjunctive)

property resourceOwners: bool

Whether the filter should return true for owners of a particular resource. Only applies to policies of type ‘Metadata’, which have a resource associated with them.

property resourceOwnersTypes: None | List[str]

Define type of ownership for the policy

property roles: None | List[str]

A specific set of roles to apply the policy to (disjunctive).

property users: None | List[str]

A specific set of users to apply the policy to (disjunctive)

class datahub.metadata.schema_classes.DataHubIngestionSourceConfigClass(recipe, version=None, executorId=None, debugMode=None)

Bases: DictWrapper

Parameters:
  • recipe (str)

  • version (Optional[str])

  • executorId (Optional[str])

  • debugMode (Optional[bool])

property debugMode: None | bool

Whether or not to run this ingestion source in debug mode

property executorId: None | str

The id of the executor to use to execute the ingestion run

property recipe: str

The JSON recipe to use for ingestion

property version: None | str

The PyPI version of the datahub CLI to use when executing a recipe

class datahub.metadata.schema_classes.DataHubIngestionSourceInfoClass(name, type, config, platform=None, schedule=None)

Bases: _Aspect

Info about a DataHub ingestion source

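Parameters:
  • name (str)

  • type (str)

  • config (DataHubIngestionSourceConfigClass)

  • platform (Optional[str])

  • schedule (Optional[DataHubIngestionSourceScheduleClass])

property config: DataHubIngestionSourceConfigClass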

Parameters associated with the Ingestion Source

property name: str

The display name of the ingestion source

property platform: None | str

Data Platform URN associated with the source

property schedule: None | DataHubIngestionSourceScheduleClass

The schedule on which the ingestion source is executed

property type: str

The type of the source itself, e.g. mysql, bigquery, bigquery-usage. Should match the recipe.

class datahub.metadata.schema_classes.DataHubIngestionSourceKeyClass(id)

Bases: _Aspect

Key for a DataHub ingestion source

Parameters:

id (str)

property id: str

A unique id for the Ingestion Source, either generated or provided

class datahub.metadata.schema_classes.DataHubIngestionSourceScheduleClass(interval, timezone)

Bases: DictWrapper

The schedule associated with an ingestion source.

Parameters:
  • interval (str)

  • timezone (str)

property interval: str

A cron-formatted execution interval, as a cron string, e.g. * * * * *

property timezone: str

Timezone in which the cron interval applies, e.g. America/Los_Angeles

class datahub.metadata.schema_classes.DataHubPolicyInfoClass(displayName, description, type, state, privileges, actors, resources=None, editable=None, lastUpdatedTimestamp=None)

Bases: _Aspect

Information about a DataHub (UI) access policy.

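Parameters:
  • displayName (str)

  • description (str)

  • type (str)

  • state (str)

  • privileges (List[str])

  • actors (DataHubActorFilterClass)

  • resources (Optional[DataHubResourceFilterClass])

  • editable (Optional[bool])

  • lastUpdatedTimestamp (Optional[int])

property actors: DataHubActorFilterClass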

The actors that the policy applies to.

property description: str

Description of the Policy

property displayName: str

Display name of the Policy

property editable: bool

Whether the policy should be editable via the UI

property lastUpdatedTimestamp: None | int

Timestamp when the policy was last updated

property privileges: List[str]

The privileges that the policy grants.

property resources: None | DataHubResourceFilterClass

The resource that the policy applies to. Not required for some ‘Platform’ privileges.

property state: str

The state of policy, ACTIVE or INACTIVE

property type: str

The type of policy

class datahub.metadata.schema_classes.DataHubPolicyKeyClass(id)

Bases: _Aspect

Key for a DataHub Policy

Parameters:

id (str)

property id: str

A unique id for the DataHub access policy record. Generated on the server side at policy creation time.

class datahub.metadata.schema_classes.DataHubPolicySnapshotClass(urn, aspects)

Bases: DictWrapper

A metadata snapshot for DataHub Access Policy data.

Parameters:
  • urn (str)

  • aspects (List[Union[DataHubPolicyKeyClass, DataHubPolicyInfoClass]])

property aspects: List[DataHubPolicyKeyClass | DataHubPolicyInfoClass]

The list of metadata aspects associated with the DataHub access policy.

property urn: str

URN for the entity the metadata snapshot is associated with.

class datahub.metadata.schema_classes.DataHubResourceFilterClass(type=None, resources=None, allResources=None, filter=None)

Bases: DictWrapper

Information used to filter DataHub resource.

Parameters:
  • type (Optional[str])

  • resources (Optional[List[str]])

  • allResources (Optional[bool])

  • filter (Optional[PolicyMatchFilterClass])

property allResources: bool

Whether the policy should be applied to all assets matching the filter.

property filter: None | PolicyMatchFilterClass

Filter to apply privileges to

property resources: None | List[str]

A specific set of resources to apply the policy to, e.g. asset urns

property type: None | str

The type of resource that the policy applies to. This will most often be a data asset entity name, for example ‘dataset’. It is not strictly required because in the future we will want to support filtering a resource by domain, as well.

class datahub.metadata.schema_classes.DataHubRetentionConfigClass(retention)

Bases: _Aspect

Parameters:

retention (RetentionClass)

property retention: RetentionClass
class datahub.metadata.schema_classes.DataHubRetentionKeyClass(entityName, aspectName)

Bases: _Aspect

Key for a DataHub Retention

Parameters:
  • entityName (str)

  • aspectName (str)

property aspectName: str

Aspect name to apply retention to. * (or empty) for applying defaults.

property entityName: str

Entity name to apply retention to. * (or empty) for applying defaults.

class datahub.metadata.schema_classes.DataHubRetentionSnapshotClass(urn, aspects)

Bases: DictWrapper

A metadata snapshot for DataHub Retention data.

Parameters:
  • urn (str)

  • aspects (List[Union[DataHubRetentionKeyClass, DataHubRetentionConfigClass]])

property aspects: List[DataHubRetentionKeyClass | DataHubRetentionConfigClass]

The list of metadata aspects associated with the DataHub retention.

property urn: str

URN for the entity the metadata snapshot is associated with.

class datahub.metadata.schema_classes.DataHubRoleInfoClass(name, description, editable=None)

Bases: _Aspect

Information about a DataHub Role.

Parameters:
  • name (str)

  • description (str)

  • editable (Optional[bool])

property description: str

Description of the Role

property editable: bool

Whether the role should be editable via the UI

property name: str

Name of the Role

class datahub.metadata.schema_classes.DataHubRoleKeyClass(id)

Bases: _Aspect

Key for a DataHub Role

Parameters:

id (str)

property id: str

A unique id for the DataHub role record. Generated on the server side at role creation time.

class datahub.metadata.schema_classes.DataHubSecretKeyClass(id)

Bases: _Aspect

Key for a DataHub Secret

Parameters:

id (str)

property id: str

A unique id for the Secret

class datahub.metadata.schema_classes.DataHubSecretValueClass(name, value, description=None, created=None)

Bases: _Aspect

The value of a DataHub Secret

Parameters:
  • name (str)

  • value (str)

  • description (Optional[str])

  • created (Optional[AuditStampClass])

property created: None | AuditStampClass

Created Audit stamp

property description: None | str

Description of the secret

property name: str

The display name for the secret

property value: str

The AES-encrypted value of the DataHub secret.

class datahub.metadata.schema_classes.DataHubStepStateKeyClass(id)

Bases: _Aspect

Key for a DataHub Step State

Parameters:

id (str)

property id: str

A unique id for the state

class datahub.metadata.schema_classes.DataHubStepStatePropertiesClass(lastModified, properties=None)

Bases: _Aspect

The properties associated with a DataHub step state

Parameters:
  • lastModified (AuditStampClass)

  • properties (Optional[Dict[str, str]])

property lastModified: AuditStampClass

Audit stamp describing the last person to update it.

property properties: Dict[str, str]

Key-value properties associated with the step state

class datahub.metadata.schema_classes.DataHubUpgradeKeyClass(id)

Bases: _Aspect

Key for a DataHubUpgrade

Parameters:

id (str)

property id: str
class datahub.metadata.schema_classes.DataHubUpgradeRequestClass(timestampMs, version)

Bases: _Aspect

Information collected when kicking off a DataHubUpgrade

Parameters:
  • timestampMs (int)

  • version (str)

property timestampMs: int

Timestamp when we started this DataHubUpgrade

property version: str

Version of this upgrade

class datahub.metadata.schema_classes.DataHubUpgradeResultClass(timestampMs, result=None)

Bases: _Aspect

Information collected when a DataHubUpgrade successfully finishes

Parameters:
  • timestampMs (int)

  • result (Optional[Dict[str, str]])

property result: None | Dict[str, str]

Result map to place helpful information about this upgrade job

property timestampMs: int

Timestamp when we started this DataHubUpgrade

class datahub.metadata.schema_classes.DataHubViewDefinitionClass(entityTypes, filter)

Bases: DictWrapper

A View definition.

Parameters:
  • entityTypes (List[str])

  • filter (FilterClass)

property entityTypes: List[str]

The Entity Types in the scope of the View.

property filter: FilterClass

The filter criteria, which represents the view itself

class datahub.metadata.schema_classes.DataHubViewInfoClass(name, type, definition, created, lastModified, description=None)

Bases: _Aspect

Information about a DataHub View. – TODO: Understand whether an entity type filter is required.

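Parameters:
  • name (str)

  • type (Union[str, DataHubViewTypeClass])

  • definition (DataHubViewDefinitionClass)

  • created (AuditStampClass)

  • lastModified (AuditStampClass)

  • description (Optional[str])

property created: AuditStampClass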

Audit stamp capturing the time and actor who created the View.

property definition: DataHubViewDefinitionClass

The view itself

property description: None | str

Description of the view

property lastModified: AuditStampClass

Audit stamp capturing the time and actor who last modified the View.

property name: str

The name of the View

property type: str | DataHubViewTypeClass

The type of View

class datahub.metadata.schema_classes.DataHubViewKeyClass(id)

Bases: _Aspect

Key for a DataHub View

Parameters:

id (str)

property id: str

A unique id for the View

class datahub.metadata.schema_classes.DataHubViewTypeClass

Bases: object

GLOBAL = 'GLOBAL'

A global view, which all users can see and use.

PERSONAL = 'PERSONAL'

A view private for a specific person.

class datahub.metadata.schema_classes.DataJobInfoClass(name, type, customProperties=None, externalUrl=None, description=None, flowUrn=None, created=None, lastModified=None, status=None)

Bases: _Aspect

Information about a Data processing job

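Parameters:
  • name (str)

  • type (Union[str, AzkabanJobTypeClass])

  • customProperties (Optional[Dict[str, str]])

  • externalUrl (Optional[str])

  • description (Optional[str])

  • flowUrn (Optional[str])

  • created (Optional[TimeStampClass])

  • lastModified (Optional[TimeStampClass])

  • status (Union[None, str, JobStatusClass])

property created: None | TimeStampClass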

A timestamp documenting when the asset was created in the source Data Platform (not on DataHub)

property customProperties: Dict[str, str]

Custom property bag.

property description: None | str

Job description

property externalUrl: None | str

URL where the reference exists

property flowUrn: None | str

DataFlow urn that this job is part of

property lastModified: None | TimeStampClass

A timestamp documenting when the asset was last modified in the source Data Platform (not on DataHub)

property name: str

Job name

property status: None | str | JobStatusClass

Status of the job - Deprecated for Data Process Instance model.

property type: str | AzkabanJobTypeClass

Datajob type. NOTE: AzkabanJobType is deprecated. Please use strings instead.

class datahub.metadata.schema_classes.DataJobInputOutputClass(inputDatasets, outputDatasets, inputDatasetEdges=None, outputDatasetEdges=None, inputDatajobs=None, inputDatajobEdges=None, inputDatasetFields=None, outputDatasetFields=None, fineGrainedLineages=None)

Bases: _Aspect

Information about the inputs and outputs of a Data processing job

Parameters:
  • inputDatasets (List[str])

  • outputDatasets (List[str])

  • inputDatasetEdges (Optional[List[EdgeClass]])

  • outputDatasetEdges (Optional[List[EdgeClass]])

  • inputDatajobs (Optional[List[str]])

  • inputDatajobEdges (Optional[List[EdgeClass]])

  • inputDatasetFields (Optional[List[str]])

  • outputDatasetFields (Optional[List[str]])

  • fineGrainedLineages (Optional[List[FineGrainedLineageClass]])

property fineGrainedLineages: None | List[FineGrainedLineageClass]

Fine-grained column-level lineages. Not currently supported in the UI. Use the UpstreamLineage aspect for datasets to express Column Level Lineage for the UI.

property inputDatajobEdges: None | List[EdgeClass]

Input datajobs that this data job depends on

property inputDatajobs: None | List[str]

Input datajobs that this data job depends on. Deprecated! Use inputDatajobEdges instead.

property inputDatasetEdges: None | List[EdgeClass]

Input datasets consumed by the data job during processing

property inputDatasetFields: None | List[str]

Fields of the input datasets used by this job

property inputDatasets: List[str]

Input datasets consumed by the data job during processing. Deprecated! Use inputDatasetEdges instead.

property outputDatasetEdges: None | List[EdgeClass]

Output datasets produced by the data job during processing

property outputDatasetFields: None | List[str]

Fields of the output datasets this job writes to

property outputDatasets: List[str]

Output datasets produced by the data job during processing. Deprecated! Use outputDatasetEdges instead.

class datahub.metadata.schema_classes.DataJobKeyClass(flow, jobId)

Bases: _Aspect

Key for a Data Job

Parameters:
  • flow (str)

  • jobId (str)

property flow: str

Standardized data processing flow urn representing the flow for the job

property jobId: str

Unique Identifier of the data job

class datahub.metadata.schema_classes.DataJobSnapshotClass(urn, aspects)

Bases: DictWrapper

A metadata snapshot for a specific DataJob entity.

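Parameters:
  • urn (str)

  • aspects (List[Union[DataJobKeyClass, DataJobInfoClass, DataJobInputOutputClass, EditableDataJobPropertiesClass, OwnershipClass, StatusClass, GlobalTagsClass, BrowsePathsClass, GlossaryTermsClass, InstitutionalMemoryClass, DataPlatformInstanceClass, BrowsePathsV2Class]])
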
property aspects: List[DataJobKeyClass | DataJobInfoClass | DataJobInputOutputClass | EditableDataJobPropertiesClass | OwnershipClass | StatusClass | GlobalTagsClass | BrowsePathsClass | GlossaryTermsClass | InstitutionalMemoryClass | DataPlatformInstanceClass | BrowsePathsV2Class]

The list of metadata aspects associated with the data job. Depending on the use case, this can either be all, or a selection, of supported aspects.

property urn: str

URN for the entity the metadata snapshot is associated with.

class datahub.metadata.schema_classes.DataPlatformInfoClass(name, type, datasetNameDelimiter, displayName=None, logoUrl=None)

Bases: _Aspect

Information about a data platform

Parameters:
  • name (str)

  • type (Union[str, PlatformTypeClass])

  • datasetNameDelimiter (str)

  • displayName (Optional[str])

  • logoUrl (Optional[str])

property datasetNameDelimiter: str

The delimiter in the dataset names on the data platform, e.g. ‘/’ for HDFS and ‘.’ for Oracle

property displayName: None | str

The name that will be used for displaying a platform type.

property logoUrl: None | str

The URL for a logo associated with the platform

property name: str

Name of the data platform

property type: str | PlatformTypeClass

Platform type this data platform describes

class datahub.metadata.schema_classes.DataPlatformInstanceClass(platform, instance=None)

Bases: _Aspect

The specific instance of the data platform that this entity belongs to

Parameters:
  • platform (str)

  • instance (Optional[str])

property instance: None | str

Instance of the data platform (e.g. db instance)

property platform: str

Data Platform

class datahub.metadata.schema_classes.DataPlatformInstanceKeyClass(platform, instance)

Bases: _Aspect

Key for a Data Platform Instance

Parameters:
  • platform (str)

  • instance (str)

property instance: str

Unique instance id

property platform: str

Data platform urn associated with the instance

class datahub.metadata.schema_classes.DataPlatformInstancePropertiesClass(customProperties=None, externalUrl=None, name=None, description=None)

Bases: _Aspect

Properties associated with a Data Platform Instance

Parameters:
  • customProperties (Optional[Dict[str, str]])

  • externalUrl (Optional[str])

  • name (Optional[str])

  • description (Optional[str])

property customProperties: Dict[str, str]

Custom property bag.

property description: None | str

Documentation of the Data Platform Instance

property externalUrl: None | str

URL where the reference exists

property name: None | str

Display name of the Data Platform Instance

class datahub.metadata.schema_classes.DataPlatformKeyClass(platformName)

Bases: _Aspect

Key for a Data Platform

Parameters:

platformName (str)

property platformName: str

Data platform name, e.g. hdfs, oracle, espresso

class datahub.metadata.schema_classes.DataPlatformSnapshotClass(urn, aspects)

Bases: DictWrapper

A metadata snapshot for a specific dataplatform entity.

Parameters:
  • urn (str)

  • aspects (List[Union[DataPlatformKeyClass, DataPlatformInfoClass]])

property aspects: List[DataPlatformKeyClass | DataPlatformInfoClass]

The list of metadata aspects associated with the data platform. Depending on the use case, this can either be all, or a selection, of supported aspects.

property urn: str

URN for the entity the metadata snapshot is associated with.

class datahub.metadata.schema_classes.DataProcessInfoClass(inputs=None, outputs=None)

Bases: _Aspect

The inputs and outputs of this data process

Parameters:
  • inputs (Optional[List[str]])

  • outputs (Optional[List[str]])

property inputs: None | List[str]

the inputs of the data process

property outputs: None | List[str]

the outputs of the data process

class datahub.metadata.schema_classes.DataProcessInstanceInputClass(inputs)

Bases: _Aspect

Information about the input datasets of a Data process

Parameters:

inputs (List[str])

property inputs: List[str]

Input datasets to be consumed

class datahub.metadata.schema_classes.DataProcessInstanceKeyClass(id)

Bases: _Aspect

Key for an Asset DataProcessInstance

Parameters:

id (str)

property id: str

A unique id for the DataProcessInstance. Should be separate from the name used for displaying a DataProcessInstance.

class datahub.metadata.schema_classes.DataProcessInstanceOutputClass(outputs)

Bases: _Aspect

Information about the outputs of a Data process

Parameters:

outputs (List[str])

property outputs: List[str]

Output datasets to be produced

class datahub.metadata.schema_classes.DataProcessInstancePropertiesClass(name, created, customProperties=None, externalUrl=None, type=None)

Bases: _Aspect

The properties of a data process instance

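Parameters:
  • name (str)

  • created (AuditStampClass)

  • customProperties (Optional[Dict[str, str]])

  • externalUrl (Optional[str])

  • type (Union[None, str, DataProcessTypeClass])
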
property created: AuditStampClass

Audit stamp containing who reported the lineage and when

property customProperties: Dict[str, str]

Custom property bag.

property externalUrl: None | str

URL where the reference exists

property name: str

Process name

property type: None | str | DataProcessTypeClass

Process type

class datahub.metadata.schema_classes.DataProcessInstanceRelationshipsClass(upstreamInstances, parentTemplate=None, parentInstance=None)

Bases: _Aspect

Information about Data process relationships

Parameters:
  • upstreamInstances (List[str])

  • parentTemplate (Optional[str])

  • parentInstance (Optional[str])

property parentInstance: None | str

The parent DataProcessInstance to which it belongs. If it is an Airflow Task, then it should also belong to an Airflow DAG run, which will be another DataProcessInstance.

property parentTemplate: None | str

The parent entity whose run instance it is

property upstreamInstances: List[str]

Input DataProcessInstance which triggered this dataprocess instance

class datahub.metadata.schema_classes.DataProcessInstanceRunEventClass(timestampMillis, status, eventGranularity=None, partitionSpec=None, messageId=None, externalUrl=None, attempt=None, result=None)

Bases: _Aspect

An event representing the current status of data process run. DataProcessRunEvent should be used for reporting the status of a dataProcess’ run.

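Parameters:
  • timestampMillis (int)

  • status (Union[str, DataProcessRunStatusClass])

  • eventGranularity (Optional[TimeWindowSizeClass])

  • partitionSpec (Optional[PartitionSpecClass])

  • messageId (Optional[str])

  • externalUrl (Optional[str])

  • attempt (Optional[int])

  • result (Optional[DataProcessInstanceRunResultClass])
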
ASPECT_TYPE: ClassVar[str] = 'timeseries'
property attempt: None | int

Return the try number that this Instance Run is in

property eventGranularity: None | TimeWindowSizeClass

Granularity of the event if applicable

property externalUrl: None | str

URL where the reference exists

property messageId: None | str

The optional messageId, if provided serves as a custom user-defined unique identifier for an aspect value.

property partitionSpec: PartitionSpecClass | None

The optional partition specification.

property result: None | DataProcessInstanceRunResultClass

The final result of the Data Processing run.

property status: str | DataProcessRunStatusClass
property timestampMillis: int

The event timestamp field as epoch at UTC in milliseconds.

class datahub.metadata.schema_classes.DataProcessInstanceRunResultClass(type, nativeResultType)

Bases: DictWrapper

Parameters:
  • type (Union[str, RunResultTypeClass])

  • nativeResultType (str)

property nativeResultType: str

It identifies the system where the native result comes from, such as Airflow, Azkaban, etc.

property type: str | RunResultTypeClass

The final result, e.g. SUCCESS, FAILURE, SKIPPED, or UP_FOR_RETRY.

class datahub.metadata.schema_classes.DataProcessKeyClass(name, orchestrator, origin)

Bases: _Aspect

Key for a Data Process

Parameters:
  • name (str)

  • orchestrator (str)

  • origin (Union[str, FabricTypeClass])

property name: str

Process name, e.g. an ETL job name

property orchestrator: str

Standardized Orchestrator where data process is defined. TODO: Migrate towards something that can be validated like DataPlatform urn

property origin: str | FabricTypeClass

Fabric type where dataset belongs to or where it was generated.

class datahub.metadata.schema_classes.DataProcessRunStatusClass

Bases: object

The status that the data processing run is in.

COMPLETE = 'COMPLETE'
STARTED = 'STARTED'
class datahub.metadata.schema_classes.DataProcessSnapshotClass(urn, aspects)

Bases: DictWrapper

A metadata snapshot for a specific Data process entity.

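Parameters:
  • urn (str)

  • aspects (List[Union[DataProcessKeyClass, OwnershipClass, DataProcessInfoClass, StatusClass]])
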
property aspects: List[DataProcessKeyClass | OwnershipClass | DataProcessInfoClass | StatusClass]

The list of metadata aspects associated with the data process. Depending on the use case, this can either be all, or a selection, of supported aspects.

property urn: str

URN for the entity the metadata snapshot is associated with.

class datahub.metadata.schema_classes.DataProcessTypeClass

Bases: object

BATCH_AD_HOC = 'BATCH_AD_HOC'
BATCH_SCHEDULED = 'BATCH_SCHEDULED'
STREAMING = 'STREAMING'
class datahub.metadata.schema_classes.DataProductAssociationClass(destinationUrn, sourceUrn=None, created=None, lastModified=None, properties=None)

Bases: DictWrapper

Represents an association of assets to a Data Product.

Parameters:
  • destinationUrn (str)

  • sourceUrn (Optional[str])

  • created (Optional[AuditStampClass])

  • lastModified (Optional[AuditStampClass])

  • properties (Optional[Dict[str, str]])

property created: None | AuditStampClass

Audit stamp containing who created this relationship edge and when

property destinationUrn: str

Urn of the destination of this relationship edge.

property lastModified: None | AuditStampClass

Audit stamp containing who last modified this relationship edge and when

property properties: None | Dict[str, str]

A generic properties bag that allows us to store specific information on this graph edge.

property sourceUrn: None | str

Urn of the source of this relationship edge. If not specified, assumed to be the entity that this aspect belongs to.

class datahub.metadata.schema_classes.DataProductKeyClass(id)

Bases: _Aspect

Key for a Data Product

Parameters:

id (str)

property id: str

A unique id for the Data Product.

class datahub.metadata.schema_classes.DataProductPropertiesClass(customProperties=None, externalUrl=None, name=None, description=None, assets=None)

Bases: _Aspect

The main properties of a Data Product

Parameters:
  • customProperties (Optional[Dict[str, str]])

  • externalUrl (Optional[str])

  • name (Optional[str])

  • description (Optional[str])

  • assets (Optional[List[DataProductAssociationClass]])

property assets: None | List[DataProductAssociationClass]

A list of assets that are part of this Data Product

property customProperties: Dict[str, str]

Custom property bag.

property description: None | str

Documentation of the Data Product

property externalUrl: None | str

URL where the reference exists

property name: None | str

Display name of the Data Product

class datahub.metadata.schema_classes.DatahubIngestionCheckpointClass(timestampMillis, pipelineName, platformInstanceId, config, state, runId, eventGranularity=None, partitionSpec=None, messageId=None)

Bases: _Aspect

Checkpoint of a datahub ingestion run for a given job.

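Parameters:
  • timestampMillis (int)

  • pipelineName (str)

  • platformInstanceId (str)

  • config (str)

  • state (IngestionCheckpointStateClass)

  • runId (str)

  • eventGranularity (Optional[TimeWindowSizeClass])

  • partitionSpec (Optional[PartitionSpecClass])

  • messageId (Optional[str])
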
ASPECT_TYPE: ClassVar[str] = 'timeseries'
property config: str

JSON-encoded string representation of the non-secret members of the config.

property eventGranularity: None | TimeWindowSizeClass

Granularity of the event if applicable

property messageId: None | str

The optional messageId, if provided serves as a custom user-defined unique identifier for an aspect value.

property partitionSpec: PartitionSpecClass | None

The optional partition specification.

property pipelineName: str

The name of the pipeline that ran ingestion, a stable unique user provided identifier. e.g. my_snowflake1-to-datahub.

property platformInstanceId: str

The id of the instance against which the ingestion pipeline ran. e.g.: Bigquery project ids, MySQL hostnames etc.

property runId: str

The run identifier of this job.

property state: IngestionCheckpointStateClass

Opaque blob of the state representation.

property timestampMillis: int

The event timestamp field as epoch at UTC in milliseconds.

class datahub.metadata.schema_classes.DatahubIngestionRunSummaryClass(timestampMillis, pipelineName, platformInstanceId, runId, runStatus, eventGranularity=None, partitionSpec=None, messageId=None, numWorkUnitsCommitted=None, numWorkUnitsCreated=None, numEvents=None, numEntities=None, numAspects=None, numSourceAPICalls=None, totalLatencySourceAPICalls=None, numSinkAPICalls=None, totalLatencySinkAPICalls=None, numWarnings=None, numErrors=None, numEntitiesSkipped=None, config=None, custom_summary=None, softwareVersion=None, systemHostName=None, operatingSystemName=None, numProcessors=None, totalMemory=None, availableMemory=None)

Bases: _Aspect

Summary of a datahub ingestion run for a given platform.

Parameters:
  • timestampMillis (int)

  • pipelineName (str)

  • platformInstanceId (str)

  • runId (str)

  • runStatus (Union[str, JobStatusClass])

  • eventGranularity (Optional[TimeWindowSizeClass])

  • partitionSpec (Optional[PartitionSpecClass])

  • messageId (Optional[str])

  • numWorkUnitsCommitted (Optional[int])

  • numWorkUnitsCreated (Optional[int])

  • numEvents (Optional[int])

  • numEntities (Optional[int])

  • numAspects (Optional[int])

  • numSourceAPICalls (Optional[int])

  • totalLatencySourceAPICalls (Optional[int])

  • numSinkAPICalls (Optional[int])

  • totalLatencySinkAPICalls (Optional[int])

  • numWarnings (Optional[int])

  • numErrors (Optional[int])

  • numEntitiesSkipped (Optional[int])

  • config (Optional[str])

  • custom_summary (Optional[str])

  • softwareVersion (Optional[str])

  • systemHostName (Optional[str])

  • operatingSystemName (Optional[str])

  • numProcessors (Optional[int])

  • totalMemory (Optional[int])

  • availableMemory (Optional[int])

ASPECT_TYPE: ClassVar[str] = 'timeseries'
property availableMemory: None | int

The available memory on the host the ingestion pipeline ran on.

property config: None | str

The non-sensitive key-value pairs of the yaml config used as json string.

property custom_summary: None | str

Custom value.

property eventGranularity: None | TimeWindowSizeClass

Granularity of the event if applicable

property messageId: None | str

The optional messageId, if provided serves as a custom user-defined unique identifier for an aspect value.

property numAspects: None | int

The total number of aspects produced across all entities.

property numEntities: None | int

The total number of entities produced (unique entity urns).

property numEntitiesSkipped: None | int

Number of entities skipped.

property numErrors: None | int

Number of errors generated.

property numEvents: None | int

The number of events produced (MCE + MCP).

property numProcessors: None | int

The number of processors on the host the ingestion pipeline ran on.

property numSinkAPICalls: None | int

Total number of sink API calls.

property numSourceAPICalls: None | int

Total number of source API calls.

property numWarnings: None | int

Number of warnings generated.

property numWorkUnitsCommitted: None | int

The number of workunits written to sink.

property numWorkUnitsCreated: None | int

The number of workunits that are produced.

property operatingSystemName: None | str

The os the ingestion pipeline ran on.

property partitionSpec: PartitionSpecClass | None

The optional partition specification.

property pipelineName: str

The name of the pipeline that ran ingestion, a stable unique user provided identifier. e.g. my_snowflake1-to-datahub.

property platformInstanceId: str

The id of the instance against which the ingestion pipeline ran. e.g.: Bigquery project ids, MySQL hostnames etc.

property runId: str

The runId for this pipeline instance.

property runStatus: str | JobStatusClass

Run Status - Succeeded/Skipped/Failed etc.

property softwareVersion: None | str

The software version of this ingestion.

property systemHostName: None | str

The hostname the ingestion pipeline ran on.

property timestampMillis: int

The event timestamp field as epoch at UTC in milliseconds.

property totalLatencySinkAPICalls: None | int

Total latency across all sink API calls.

property totalLatencySourceAPICalls: None | int

Total latency across all source API calls.

property totalMemory: None | int

The total amount of memory on the host the ingestion pipeline ran on.

class datahub.metadata.schema_classes.DatasetAssertionInfoClass(dataset, scope, operator, fields=None, aggregation=None, parameters=None, nativeType=None, nativeParameters=None, logic=None)

Bases: DictWrapper

Attributes that are applicable to single-Dataset Assertions

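Parameters:
  • dataset (str)

  • scope (Union[str, DatasetAssertionScopeClass])

  • operator (Union[str, AssertionStdOperatorClass])

  • fields (Optional[List[str]])

  • aggregation (Union[None, str, AssertionStdAggregationClass])

  • parameters (Optional[AssertionStdParametersClass])

  • nativeType (Optional[str])

  • nativeParameters (Optional[Dict[str, str]])

  • logic (Optional[str])
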
property aggregation: None | str | AssertionStdAggregationClass

Standardized assertion aggregation function

property dataset: str

The dataset targeted by this assertion.

property fields: None | List[str]

One or more dataset schema fields that are targeted by this assertion

property logic: None | str
property nativeParameters: None | Dict[str, str]

Native parameters required for the assertion.

property nativeType: None | str

Native assertion type

property operator: str | AssertionStdOperatorClass

Standardized assertion operator

property parameters: None | AssertionStdParametersClass

Standard parameters required for the assertion. e.g. min_value, max_value, value, columns

property scope: str | DatasetAssertionScopeClass

Scope of the Assertion. What part of the dataset does this assertion apply to?

class datahub.metadata.schema_classes.DatasetAssertionScopeClass

Bases: object

DATASET_COLUMN = 'DATASET_COLUMN'

This assertion applies to dataset columns

DATASET_ROWS = 'DATASET_ROWS'

This assertion applies to entire rows of the dataset

DATASET_SCHEMA = 'DATASET_SCHEMA'

This assertion applies to the schema of the dataset

UNKNOWN = 'UNKNOWN'

The scope of the assertion is unknown

class datahub.metadata.schema_classes.DatasetDeprecationClass(deprecated, note, decommissionTime=None, actor=None)

Bases: _Aspect

Dataset deprecation status. Deprecated! This aspect is deprecated in favor of the more-general-purpose ‘Deprecation’ aspect.

Parameters:
  • deprecated (bool)

  • note (str)

  • decommissionTime (Optional[int])

  • actor (Optional[str])

property actor: None | str

The corpuser URN which will be credited for modifying this deprecation content.

property decommissionTime: None | int

The time at which the user plans to decommission this dataset.

property deprecated: bool

Whether the dataset is deprecated by owner.

property note: str

Additional information about the dataset deprecation plan, such as the wiki, doc, RB.

class datahub.metadata.schema_classes.DatasetFieldForeignKeyClass(parentDataset, currentFieldPaths, parentField)

Bases: DictWrapper

For non-urn based foreign keys.

Parameters:
  • parentDataset (str)

  • currentFieldPaths (List[str])

  • parentField (str)

property currentFieldPaths: List[str]

List of fields in the hosting (current) SchemaMetadata that form a foreign key. The list can contain a single entry, or multiple entries if several fields in the hosting schema together form a foreign key to a single parent dataset.

property parentDataset: str

dataset that stores the resource.

property parentField: str

SchemaField@fieldPath that uniquely identifies the field in the parent dataset that this field references.

class datahub.metadata.schema_classes.DatasetFieldMappingClass(created, transformation, sourceFields, destinationField)

Bases: DictWrapper

Representation of mapping between fields in source dataset to the field in destination dataset

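Parameters:
  • created (AuditStampClass)

  • transformation (Union[str, TransformationTypeClass, UDFTransformerClass])

  • sourceFields (List[str])

  • destinationField (str)
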
property created: AuditStampClass

Audit stamp containing who reported the field mapping and when

property destinationField: str

Destination field which is derived from source fields

property sourceFields: List[str]

Source fields from which the fine grained lineage is derived

property transformation: str | TransformationTypeClass | UDFTransformerClass

Transformation function between the fields involved

class datahub.metadata.schema_classes.DatasetFieldProfileClass(fieldPath, uniqueCount=None, uniqueProportion=None, nullCount=None, nullProportion=None, min=None, max=None, mean=None, median=None, stdev=None, quantiles=None, distinctValueFrequencies=None, histogram=None, sampleValues=None)

Bases: DictWrapper

Stats corresponding to fields in a dataset

Parameters:
  • fieldPath (str)

  • uniqueCount (Optional[int])

  • uniqueProportion (Optional[float])

  • nullCount (Optional[int])

  • nullProportion (Optional[float])

  • min (Optional[str])

  • max (Optional[str])

  • mean (Optional[str])

  • median (Optional[str])

  • stdev (Optional[str])

  • quantiles (Optional[List[QuantileClass]])

  • distinctValueFrequencies (Optional[List[ValueFrequencyClass]])

  • histogram (Optional[HistogramClass])

  • sampleValues (Optional[List[str]])

property distinctValueFrequencies: None | List[ValueFrequencyClass]
property fieldPath: str
property histogram: None | HistogramClass
property max: None | str
property mean: None | str
property median: None | str
property min: None | str
property nullCount: None | int
property nullProportion: None | float
property quantiles: None | List[QuantileClass]
property sampleValues: None | List[str]
property stdev: None | str
property uniqueCount: None | int
property uniqueProportion: None | float
class datahub.metadata.schema_classes.DatasetFieldUsageCountsClass(fieldPath, count)

Bases: DictWrapper

Records field-level usage counts for a given dataset

Parameters:
  • fieldPath (str)

  • count (int)

property count: int

Number of times the field has been used.

property fieldPath: str

The name of the field.

class datahub.metadata.schema_classes.DatasetKeyClass(platform, name, origin)

Bases: _Aspect

Key for a Dataset

Parameters:
  • platform (str)

  • name (str)

  • origin (Union[str, FabricTypeClass])

property name: str

Unique guid for dataset

property origin: str | FabricTypeClass

Fabric type where dataset belongs to or where it was generated.

property platform: str

Data platform urn associated with the dataset

class datahub.metadata.schema_classes.DatasetLineageTypeClass

Bases: object

The various types of supported dataset lineage

COPY = 'COPY'
TRANSFORMED = 'TRANSFORMED'

Transformed data with modification (format or content change)

VIEW = 'VIEW'

Represents a view defined on the sources, e.g. a Hive view defined on underlying Hive tables, a Hive table pointing to an HDFS dataset, or a DALI view defined on multiple sources

class datahub.metadata.schema_classes.DatasetProfileClass(timestampMillis, eventGranularity=None, partitionSpec=None, messageId=None, rowCount=None, columnCount=None, fieldProfiles=None, sizeInBytes=None)

Bases: _Aspect

Stats corresponding to datasets

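Parameters:
  • timestampMillis (int)

  • eventGranularity (Optional[TimeWindowSizeClass])

  • partitionSpec (Optional[PartitionSpecClass])

  • messageId (Optional[str])

  • rowCount (Optional[int])

  • columnCount (Optional[int])

  • fieldProfiles (Optional[List[DatasetFieldProfileClass]])

  • sizeInBytes (Optional[int])
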
ASPECT_TYPE: ClassVar[str] = 'timeseries'
property columnCount: None | int

The total number of columns (or schema fields)

property eventGranularity: None | TimeWindowSizeClass

Granularity of the event if applicable

property fieldProfiles: None | List[DatasetFieldProfileClass]

Profiles for each column (or schema field)

property messageId: None | str

The optional messageId, if provided serves as a custom user-defined unique identifier for an aspect value.

property partitionSpec: PartitionSpecClass | None

The optional partition specification.

property rowCount: None | int

The total number of rows

property sizeInBytes: None | int

Storage size in bytes

property timestampMillis: int

The event timestamp field as epoch at UTC in milliseconds.

class datahub.metadata.schema_classes.DatasetPropertiesClass(customProperties=None, externalUrl=None, name=None, qualifiedName=None, description=None, uri=None, created=None, lastModified=None, tags=None)

Bases: _Aspect

Properties associated with a Dataset

Parameters:
  • customProperties (Optional[Dict[str, str]])

  • externalUrl (Optional[str])

  • name (Optional[str])

  • qualifiedName (Optional[str])

  • description (Optional[str])

  • uri (Optional[str])

  • created (Optional[TimeStampClass])

  • lastModified (Optional[TimeStampClass])

  • tags (Optional[List[str]])

property created: None | TimeStampClass

A timestamp documenting when the asset was created in the source Data Platform (not on DataHub)

property customProperties: Dict[str, str]

Custom property bag.

property description: None | str

Documentation of the dataset

property externalUrl: None | str

URL where the reference exists

property lastModified: None | TimeStampClass

A timestamp documenting when the asset was last modified in the source Data Platform (not on DataHub)

property name: None | str

Display name of the Dataset

property qualifiedName: None | str

Fully-qualified name of the Dataset

property tags: List[str]

[Legacy] Unstructured tags for the dataset. Structured tags can be applied via the GlobalTags aspect. This is now deprecated.

property uri: None | str

The abstracted URI such as hdfs:///data/tracking/PageViewEvent, file:///dir/file_name. Uri should not include any environment specific properties. Some datasets might not have a standardized uri, which makes this field optional (e.g. kafka topic).

class datahub.metadata.schema_classes.DatasetSnapshotClass(urn, aspects)

Bases: DictWrapper

A metadata snapshot for a specific dataset entity.

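Parameters:
  • urn (str)

  • aspects (List[Union[DatasetKeyClass, DatasetPropertiesClass, EditableDatasetPropertiesClass, DatasetDeprecationClass, DatasetUpstreamLineageClass, UpstreamLineageClass, InstitutionalMemoryClass, OwnershipClass, StatusClass, SchemaMetadataClass, EditableSchemaMetadataClass, GlobalTagsClass, GlossaryTermsClass, BrowsePathsClass, DataPlatformInstanceClass, ViewPropertiesClass, BrowsePathsV2Class]])
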
property aspects: List[DatasetKeyClass | DatasetPropertiesClass | EditableDatasetPropertiesClass | DatasetDeprecationClass | DatasetUpstreamLineageClass | UpstreamLineageClass | InstitutionalMemoryClass | OwnershipClass | StatusClass | SchemaMetadataClass | EditableSchemaMetadataClass | GlobalTagsClass | GlossaryTermsClass | BrowsePathsClass | DataPlatformInstanceClass | ViewPropertiesClass | BrowsePathsV2Class]

The list of metadata aspects associated with the dataset. Depending on the use case, this can either be all, or a selection, of supported aspects.

property urn: str

URN for the entity the metadata snapshot is associated with.

class datahub.metadata.schema_classes.DatasetUpstreamLineageClass(fieldMappings)

Bases: _Aspect

Fine Grained upstream lineage for fields in a dataset

Parameters:

fieldMappings (List[DatasetFieldMappingClass])

property fieldMappings: List[DatasetFieldMappingClass]

Upstream to downstream field level lineage mappings

class datahub.metadata.schema_classes.DatasetUsageStatisticsClass(timestampMillis, eventGranularity=None, partitionSpec=None, messageId=None, uniqueUserCount=None, totalSqlQueries=None, topSqlQueries=None, userCounts=None, fieldCounts=None)

Bases: _Aspect

Stats corresponding to dataset’s usage.

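Parameters:
  • timestampMillis (int)

  • eventGranularity (Optional[TimeWindowSizeClass])

  • partitionSpec (Optional[PartitionSpecClass])

  • messageId (Optional[str])

  • uniqueUserCount (Optional[int])

  • totalSqlQueries (Optional[int])

  • topSqlQueries (Optional[List[str]])

  • userCounts (Optional[List[DatasetUserUsageCountsClass]])

  • fieldCounts (Optional[List[DatasetFieldUsageCountsClass]])
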
ASPECT_TYPE: ClassVar[str] = 'timeseries'
property eventGranularity: None | TimeWindowSizeClass

Granularity of the event if applicable

property fieldCounts: None | List[DatasetFieldUsageCountsClass]

Field-level usage stats

property messageId: None | str

The optional messageId, if provided serves as a custom user-defined unique identifier for an aspect value.

property partitionSpec: PartitionSpecClass | None

The optional partition specification.

property timestampMillis: int

The event timestamp field as epoch at UTC in milliseconds.

property topSqlQueries: None | List[str]

Frequent SQL queries; mostly makes sense for datasets in SQL databases

property totalSqlQueries: None | int

Total SQL query count

property uniqueUserCount: None | int

Unique user count

property userCounts: None | List[DatasetUserUsageCountsClass]

Users within this bucket, with frequency counts

class datahub.metadata.schema_classes.DatasetUserUsageCountsClass(user, count, userEmail=None)

Bases: DictWrapper

Records a single user’s usage counts for a given resource

Parameters:
  • user (str)

  • count (int)

  • userEmail (Optional[str])

property count: int

Number of times the dataset has been used by the user.

property user: str

The unique id of the user.

property userEmail: None | str

If user_email is set, we attempt to resolve the user’s urn upon ingest

class datahub.metadata.schema_classes.DateTypeClass

Bases: DictWrapper

Date field type.

class datahub.metadata.schema_classes.DeploymentStatusClass

Bases: object

Model endpoint statuses

CREATING = 'CREATING'

Deployments being created.

DELETING = 'DELETING'

Deployments being deleted.

FAILED = 'FAILED'

Deployments with an error state.

IN_SERVICE = 'IN_SERVICE'

Deployments that are active.

OUT_OF_SERVICE = 'OUT_OF_SERVICE'
ROLLING_BACK = 'ROLLING_BACK'

Deployments being reverted to a previous version.

UNKNOWN = 'UNKNOWN'

Deployments with unknown/unmappable state.

UPDATING = 'UPDATING'

Deployments being updated.

class datahub.metadata.schema_classes.DeprecationClass(deprecated, note, actor, decommissionTime=None)

Bases: _Aspect

Deprecation status of an entity

Parameters:
  • deprecated (bool)

  • note (str)

  • actor (str)

  • decommissionTime (Optional[int])

property actor: str

The user URN which will be credited for modifying this deprecation content.

property decommissionTime: None | int

The time at which the user plans to decommission this entity.

property deprecated: bool

Whether the entity is deprecated.

property note: str

Additional information about the entity deprecation plan, such as the wiki, doc, RB.

class datahub.metadata.schema_classes.DomainKeyClass(id)

Bases: _Aspect

Key for an Asset Domain

Parameters:

id (str)

property id: str

A unique id for the domain. Should be separate from the name used for displaying a Domain.

class datahub.metadata.schema_classes.DomainPropertiesClass(name, description=None, created=None)

Bases: _Aspect

Information about a Domain

Parameters:
  • name (str)

  • description (Optional[str])

  • created (Optional[AuditStampClass])

property created: None | AuditStampClass

Created Audit stamp

property description: None | str

Description of the Domain

property name: str

Display name of the Domain

class datahub.metadata.schema_classes.DomainsClass(domains)

Bases: _Aspect

Links from an Asset to its Domains

Parameters:

domains (List[str])

property domains: List[str]

The Domains attached to an Asset

class datahub.metadata.schema_classes.EdgeClass(destinationUrn, sourceUrn=None, created=None, lastModified=None, properties=None)

Bases: DictWrapper

A common structure to represent all edges to entities when used inside aspects as collections. This ensures that all edges have a common structure around audit stamps and will automatically support PATCH and time-travel.

Parameters:
  • destinationUrn (str)

  • sourceUrn (Optional[str])

  • created (Optional[AuditStampClass])

  • lastModified (Optional[AuditStampClass])

  • properties (Optional[Dict[str, str]])

property created: None | AuditStampClass

Audit stamp containing who created this relationship edge and when

property destinationUrn: str

Urn of the destination of this relationship edge.

property lastModified: None | AuditStampClass

Audit stamp containing who last modified this relationship edge and when

property properties: None | Dict[str, str]

A generic properties bag that allows us to store specific information on this graph edge.

property sourceUrn: None | str

Urn of the source of this relationship edge. If not specified, assumed to be the entity that this aspect belongs to.

class datahub.metadata.schema_classes.EditableChartPropertiesClass(created=None, lastModified=None, deleted=None, description=None)

Bases: _Aspect

Stores editable changes made to properties. This separates changes made from ingestion pipelines and edits in the UI to avoid accidental overwrites of user-provided data by ingestion pipelines

Parameters:
property created: AuditStampClass

An AuditStamp corresponding to the creation of this resource/association/sub-resource. A value of 0 for time indicates missing data.

property deleted: None | AuditStampClass

An AuditStamp corresponding to the deletion of this resource/association/sub-resource. Logically, deleted MUST have a later timestamp than creation. It may or may not have the same time as lastModified depending upon the resource/association/sub-resource semantics.

property description: None | str

Edited documentation of the chart

property lastModified: AuditStampClass

An AuditStamp corresponding to the last modification of this resource/association/sub-resource. If no modification has happened since creation, lastModified should be the same as created. A value of 0 for time indicates missing data.

class datahub.metadata.schema_classes.EditableContainerPropertiesClass(description=None)

Bases: _Aspect

Editable information about an Asset Container as defined on the DataHub Platform

Parameters:

description (Optional[str])

property description: None | str

Description of the Asset Container as it is received on the DataHub Platform

class datahub.metadata.schema_classes.EditableDashboardPropertiesClass(created=None, lastModified=None, deleted=None, description=None)

Bases: _Aspect

Stores editable changes made to properties. This separates changes made from ingestion pipelines and edits in the UI to avoid accidental overwrites of user-provided data by ingestion pipelines

Parameters:
property created: AuditStampClass

An AuditStamp corresponding to the creation of this resource/association/sub-resource. A value of 0 for time indicates missing data.

property deleted: None | AuditStampClass

An AuditStamp corresponding to the deletion of this resource/association/sub-resource. Logically, deleted MUST have a later timestamp than creation. It may or may not have the same time as lastModified depending upon the resource/association/sub-resource semantics.

property description: None | str

Edited documentation of the dashboard

property lastModified: AuditStampClass

An AuditStamp corresponding to the last modification of this resource/association/sub-resource. If no modification has happened since creation, lastModified should be the same as created. A value of 0 for time indicates missing data.

class datahub.metadata.schema_classes.EditableDataFlowPropertiesClass(created=None, lastModified=None, deleted=None, description=None)

Bases: _Aspect

Stores editable changes made to properties. This separates changes made from ingestion pipelines and edits in the UI to avoid accidental overwrites of user-provided data by ingestion pipelines

Parameters:
property created: AuditStampClass

An AuditStamp corresponding to the creation of this resource/association/sub-resource. A value of 0 for time indicates missing data.

property deleted: None | AuditStampClass

An AuditStamp corresponding to the deletion of this resource/association/sub-resource. Logically, deleted MUST have a later timestamp than creation. It may or may not have the same time as lastModified depending upon the resource/association/sub-resource semantics.

property description: None | str

Edited documentation of the data flow

property lastModified: AuditStampClass

An AuditStamp corresponding to the last modification of this resource/association/sub-resource. If no modification has happened since creation, lastModified should be the same as created. A value of 0 for time indicates missing data.

class datahub.metadata.schema_classes.EditableDataJobPropertiesClass(created=None, lastModified=None, deleted=None, description=None)

Bases: _Aspect

Stores editable changes made to properties. This separates changes made from ingestion pipelines and edits in the UI to avoid accidental overwrites of user-provided data by ingestion pipelines

Parameters:
property created: AuditStampClass

An AuditStamp corresponding to the creation of this resource/association/sub-resource. A value of 0 for time indicates missing data.

property deleted: None | AuditStampClass

An AuditStamp corresponding to the deletion of this resource/association/sub-resource. Logically, deleted MUST have a later timestamp than creation. It may or may not have the same time as lastModified depending upon the resource/association/sub-resource semantics.

property description: None | str

Edited documentation of the data job

property lastModified: AuditStampClass

An AuditStamp corresponding to the last modification of this resource/association/sub-resource. If no modification has happened since creation, lastModified should be the same as created. A value of 0 for time indicates missing data.

class datahub.metadata.schema_classes.EditableDatasetPropertiesClass(created=None, lastModified=None, deleted=None, description=None)

Bases: _Aspect

EditableDatasetProperties stores editable changes made to dataset properties. This separates changes made from ingestion pipelines and edits in the UI to avoid accidental overwrites of user-provided data by ingestion pipelines

Parameters:
property created: AuditStampClass

An AuditStamp corresponding to the creation of this resource/association/sub-resource. A value of 0 for time indicates missing data.

property deleted: None | AuditStampClass

An AuditStamp corresponding to the deletion of this resource/association/sub-resource. Logically, deleted MUST have a later timestamp than creation. It may or may not have the same time as lastModified depending upon the resource/association/sub-resource semantics.

property description: None | str

Documentation of the dataset

property lastModified: AuditStampClass

An AuditStamp corresponding to the last modification of this resource/association/sub-resource. If no modification has happened since creation, lastModified should be the same as created. A value of 0 for time indicates missing data.

class datahub.metadata.schema_classes.EditableMLFeaturePropertiesClass(description=None)

Bases: _Aspect

Properties associated with a MLFeature editable from the UI

Parameters:

description (Optional[str])

property description: None | str

Documentation of the MLFeature

class datahub.metadata.schema_classes.EditableMLFeatureTablePropertiesClass(description=None)

Bases: _Aspect

Properties associated with a MLFeatureTable editable from the ui

Parameters:

description (Optional[str])

property description: None | str

Documentation of the MLFeatureTable

class datahub.metadata.schema_classes.EditableMLModelGroupPropertiesClass(description=None)

Bases: _Aspect

Properties associated with an ML Model Group editable from the UI

Parameters:

description (Optional[str])

property description: None | str

Documentation of the ml model group

class datahub.metadata.schema_classes.EditableMLModelPropertiesClass(description=None)

Bases: _Aspect

Properties associated with a ML Model editable from the UI

Parameters:

description (Optional[str])

property description: None | str

Documentation of the ml model

class datahub.metadata.schema_classes.EditableMLPrimaryKeyPropertiesClass(description=None)

Bases: _Aspect

Properties associated with a MLPrimaryKey editable from the UI

Parameters:

description (Optional[str])

property description: None | str

Documentation of the MLPrimaryKey

class datahub.metadata.schema_classes.EditableNotebookPropertiesClass(created=None, lastModified=None, deleted=None, description=None)

Bases: _Aspect

Stores editable changes made to properties. This separates changes made from ingestion pipelines and edits in the UI to avoid accidental overwrites of user-provided data by ingestion pipelines. Note: This is in BETA.

Parameters:
property created: AuditStampClass

An AuditStamp corresponding to the creation of this resource/association/sub-resource. A value of 0 for time indicates missing data.

property deleted: None | AuditStampClass

An AuditStamp corresponding to the deletion of this resource/association/sub-resource. Logically, deleted MUST have a later timestamp than creation. It may or may not have the same time as lastModified depending upon the resource/association/sub-resource semantics.

property description: None | str

Edited documentation of the Notebook

property lastModified: AuditStampClass

An AuditStamp corresponding to the last modification of this resource/association/sub-resource. If no modification has happened since creation, lastModified should be the same as created. A value of 0 for time indicates missing data.

class datahub.metadata.schema_classes.EditableSchemaFieldInfoClass(fieldPath, description=None, globalTags=None, glossaryTerms=None)

Bases: DictWrapper

SchemaField to describe metadata related to dataset schema.

Parameters:
property description: None | str

Description

property fieldPath: str

FieldPath uniquely identifying the SchemaField this metadata is associated with

property globalTags: None | GlobalTagsClass

Tags associated with the field

property glossaryTerms: None | GlossaryTermsClass

Glossary terms associated with the field

class datahub.metadata.schema_classes.EditableSchemaMetadataClass(editableSchemaFieldInfo, created=None, lastModified=None, deleted=None)

Bases: _Aspect

EditableSchemaMetadata stores editable changes made to schema metadata. This separates changes made from ingestion pipelines and edits in the UI to avoid accidental overwrites of user-provided data by ingestion pipelines.

Parameters:
property created: AuditStampClass

An AuditStamp corresponding to the creation of this resource/association/sub-resource. A value of 0 for time indicates missing data.

property deleted: None | AuditStampClass

An AuditStamp corresponding to the deletion of this resource/association/sub-resource. Logically, deleted MUST have a later timestamp than creation. It may or may not have the same time as lastModified depending upon the resource/association/sub-resource semantics.

property editableSchemaFieldInfo: List[EditableSchemaFieldInfoClass]

Client provided a list of fields from document schema.

property lastModified: AuditStampClass

An AuditStamp corresponding to the last modification of this resource/association/sub-resource. If no modification has happened since creation, lastModified should be the same as created. A value of 0 for time indicates missing data.

class datahub.metadata.schema_classes.EmbedClass(renderUrl=None)

Bases: _Aspect

Information regarding rendering an embed for an asset.

Parameters:

renderUrl (Optional[str])

property renderUrl: None | str

An embed URL to be rendered inside of an iframe.

class datahub.metadata.schema_classes.EntityChangeEventClass(entityType, entityUrn, category, operation, auditStamp, version, modifier=None, parameters=None)

Bases: DictWrapper

Shared fields for all entity change events.

Parameters:
  • entityType (str)

  • entityUrn (str)

  • category (str)

  • operation (str)

  • auditStamp (AuditStampClass)

  • version (int)

  • modifier (Optional[str])

  • parameters (Optional[ParametersClass])

property auditStamp: AuditStampClass

Audit stamp of the operation

property category: str

The category type (TAG, GLOSSARY_TERM, OWNERSHIP, TECHNICAL_SCHEMA, etc). This is used to determine what the rest of the schema will look like.

property entityType: str

The type of the entity affected. Corresponds to the entity registry, e.g. ‘dataset’, ‘chart’, ‘dashboard’, etc.

property entityUrn: str

The urn of the entity which was affected.

property modifier: None | str

The urn of the entity which was affected.

property operation: str

The operation type. This is used to determine what the rest of the schema will look like.

property parameters: None | ParametersClass

Arbitrary key-value parameters corresponding to the event.

property version: int

The version of the event type, incremented in integers.

class datahub.metadata.schema_classes.EnumTypeClass

Bases: DictWrapper

Enum field type.

class datahub.metadata.schema_classes.EspressoSchemaClass(documentSchema, tableSchema)

Bases: DictWrapper

Schema text of an espresso table schema.

Parameters:
  • documentSchema (str)

  • tableSchema (str)

property documentSchema: str

The native espresso document schema.

property tableSchema: str

The espresso table schema definition.

class datahub.metadata.schema_classes.EthicalConsiderationsClass(data=None, humanLife=None, mitigations=None, risksAndHarms=None, useCases=None)

Bases: _Aspect

This section is intended to demonstrate the ethical considerations that went into MLModel development, surfacing ethical challenges and solutions to stakeholders.

Parameters:
  • data (Optional[List[str]])

  • humanLife (Optional[List[str]])

  • mitigations (Optional[List[str]])

  • risksAndHarms (Optional[List[str]])

  • useCases (Optional[List[str]])

property data: None | List[str]

Does the MLModel use any sensitive data (e.g., protected classes)?

property humanLife: None | List[str]

Is the MLModel intended to inform decisions about matters central to human life or flourishing - e.g., health or safety? Or could it be used in such a way?

property mitigations: None | List[str]

What risk mitigation strategies were used during MLModel development?

property risksAndHarms: None | List[str]

What risks may be present in MLModel usage? Try to identify the potential recipients, likelihood, and magnitude of harms. If these cannot be determined, note that they were considered but remain unknown.

property useCases: None | List[str]

Are there any known MLModel use cases that are especially fraught? This may connect directly to the intended use section

class datahub.metadata.schema_classes.EvaluationDataClass(evaluationData)

Bases: _Aspect

All referenced datasets would ideally point to any set of documents that provide visibility into the source and composition of the dataset.

Parameters:

evaluationData (List[BaseDataClass])

property evaluationData: List[BaseDataClass]

Details on the dataset(s) used for the quantitative analyses in the MLModel

class datahub.metadata.schema_classes.ExecutionRequestInputClass(task, args, executorId, source, requestedAt)

Bases: _Aspect

A request to execute some remote logic or action. TODO: Determine who is responsible for emitting execution request success or failure. Executor?

Parameters:
property args: Dict[str, str]

Arguments provided to the task

property executorId: str

Advanced: specify a specific executor to route the request to. If none is provided, a “default” executor is used.

property requestedAt: int

Time at which the execution request input was created

property source: ExecutionRequestSourceClass

Source which created the execution request

property task: str

The name of the task to execute, for example RUN_INGEST

class datahub.metadata.schema_classes.ExecutionRequestKeyClass(id)

Bases: _Aspect

Key for a DataHub Execution Request

Parameters:

id (str)

property id: str

A unique id for the DataHub execution request.

class datahub.metadata.schema_classes.ExecutionRequestResultClass(status, report=None, structuredReport=None, startTimeMs=None, durationMs=None)

Bases: _Aspect

The result of an execution request

Parameters:
  • status (str)

  • report (Optional[str])

  • structuredReport (Optional[StructuredExecutionReportClass])

  • startTimeMs (Optional[int])

  • durationMs (Optional[int])

property durationMs: None | int

Duration in milliseconds

property report: None | str

The pretty-printed execution report.

property startTimeMs: None | int

Time at which the request was created

property status: str

The status of the execution request

property structuredReport: None | StructuredExecutionReportClass

A structured report if available.

class datahub.metadata.schema_classes.ExecutionRequestSignalClass(signal, createdAt, executorId=None)

Bases: _Aspect

A signal sent to a running execution request

Parameters:
property createdAt: AuditStampClass

Audit Stamp

property executorId: None | str

Advanced: specify a specific executor to route the request to. If none is provided, a “default” executor is used.

property signal: str

The signal to issue, e.g. KILL

class datahub.metadata.schema_classes.ExecutionRequestSourceClass(type, ingestionSource=None)

Bases: DictWrapper

Parameters:
  • type (str)

  • ingestionSource (Optional[str])

property ingestionSource: None | str

The urn of the ingestion source associated with the ingestion request. Present if type is INGESTION_SOURCE

property type: str

The type of the execution request source, e.g. INGESTION_SOURCE

class datahub.metadata.schema_classes.FabricTypeClass

Bases: object

Fabric group type

CORP = 'CORP'

Designates corporation fabrics

DEV = 'DEV'
EI = 'EI'

Designates early-integration fabrics

NON_PROD = 'NON_PROD'

Designates non-production fabrics

PRE = 'PRE'

Designates pre-production fabrics

PROD = 'PROD'

Designates production fabrics

QA = 'QA'

Designates quality assurance fabrics

STG = 'STG'

Designates staging fabrics

TEST = 'TEST'

Designates testing fabrics

UAT = 'UAT'

Designates user acceptance testing fabrics

class datahub.metadata.schema_classes.FieldUsageCountsClass(fieldName, count)

Bases: DictWrapper

Records field-level usage counts for a given resource

Parameters:
  • fieldName (str)

  • count (int)

property count: int
property fieldName: str
class datahub.metadata.schema_classes.FilterClass(or_=None, criteria=None)

Bases: DictWrapper

The filter for finding a record or a collection of records

Parameters:
property criteria: None | List[CriterionClass]

Deprecated! A list of conjunctive criteria for the filter. If the “or” field is provided, then this field is ignored.

property or_: None | List[ConjunctiveCriterionClass]

A list of disjunctive criteria for the filter (an OR operation to combine filters).

class datahub.metadata.schema_classes.FineGrainedLineageClass(upstreamType, downstreamType, upstreams=None, downstreams=None, transformOperation=None, confidenceScore=None)

Bases: DictWrapper

A fine-grained lineage from upstream fields/datasets to downstream field(s)

Parameters:
property confidenceScore: float

The confidence in this lineage between 0 (low confidence) and 1 (high confidence)

property downstreamType: str | FineGrainedLineageDownstreamTypeClass

The type of downstream field(s)

property downstreams: None | List[str]

Downstream fields in the lineage

property transformOperation: None | str

The transform operation applied to the upstream entities to produce the downstream field(s)

property upstreamType: str | FineGrainedLineageUpstreamTypeClass

The type of upstream entity

property upstreams: None | List[str]

Upstream entities in the lineage

class datahub.metadata.schema_classes.FineGrainedLineageDownstreamTypeClass

Bases: object

The type of downstream field(s) in a fine-grained lineage

FIELD = 'FIELD'
FIELD_SET = 'FIELD_SET'

Indicates that the lineage is for a set of downstream fields

class datahub.metadata.schema_classes.FineGrainedLineageUpstreamTypeClass

Bases: object

The type of upstream entity in a fine-grained lineage

DATASET = 'DATASET'

Indicates that this lineage is originating from upstream dataset(s)

FIELD_SET = 'FIELD_SET'
NONE = 'NONE'

Indicates that there is no upstream lineage, i.e. the downstream field is not a derived field

class datahub.metadata.schema_classes.FixedTypeClass

Bases: DictWrapper

Fixed field type.

class datahub.metadata.schema_classes.ForeignKeyConstraintClass(name, foreignFields, sourceFields, foreignDataset)

Bases: DictWrapper

Description of a foreign key constraint in a schema.

Parameters:
  • name (str)

  • foreignFields (List[str])

  • sourceFields (List[str])

  • foreignDataset (str)

property foreignDataset: str

Reference to the foreign dataset for ease of lookup

property foreignFields: List[str]

Fields the constraint maps to on the foreign dataset

property name: str

Name of the constraint, likely provided from the source

property sourceFields: List[str]

Fields the constraint maps to on the source dataset

class datahub.metadata.schema_classes.ForeignKeySpecClass(foreignKey)

Bases: DictWrapper

Description of a foreign key in a schema.

Parameters:

foreignKey (Union[DatasetFieldForeignKeyClass, UrnForeignKeyClass])

property foreignKey: DatasetFieldForeignKeyClass | UrnForeignKeyClass

Foreign key definition in metadata schema.

class datahub.metadata.schema_classes.GenericAspectClass(value, contentType)

Bases: DictWrapper

Generic record structure for serializing an Aspect

Parameters:
  • value (bytes)

  • contentType (str)

property contentType: str

The content type, which represents the fashion in which the aspect was serialized. The only type currently supported is application/json.

property value: bytes

The value of the aspect, serialized as bytes.

class datahub.metadata.schema_classes.GenericPayloadClass(value, contentType)

Bases: DictWrapper

Generic payload record structure for serializing a Platform Event.

Parameters:
  • value (bytes)

  • contentType (str)

property contentType: str

The content type, which represents the fashion in which the event was serialized. The only type currently supported is application/json.

property value: bytes

The value of the event, serialized as bytes.

class datahub.metadata.schema_classes.GlobalSettingsInfoClass(views=None)

Bases: _Aspect

DataHub Global platform settings. Careful - these should not be modified by the outside world!

Parameters:

views (Optional[GlobalViewsSettingsClass])

property views: None | GlobalViewsSettingsClass

Settings related to the Views Feature

class datahub.metadata.schema_classes.GlobalSettingsKeyClass(id)

Bases: _Aspect

Key for the Global Settings

Parameters:

id (str)

property id: str

Id for the settings. There should be only 1 global settings urn: urn:li:globalSettings:0

class datahub.metadata.schema_classes.GlobalTagsClass(tags)

Bases: _Aspect

Tag aspect used for applying tags to an entity

Parameters:

tags (List[TagAssociationClass])

property tags: List[TagAssociationClass]

Tags associated with a given entity

class datahub.metadata.schema_classes.GlobalViewsSettingsClass(defaultView=None)

Bases: DictWrapper

Settings for DataHub Views feature.

Parameters:

defaultView (Optional[str])

property defaultView: None | str

The default View for the instance, or organization.

class datahub.metadata.schema_classes.GlossaryNodeInfoClass(definition, parentNode=None, name=None, id=None)

Bases: _Aspect

Properties associated with a GlossaryNode

Parameters:
  • definition (str)

  • parentNode (Optional[str])

  • name (Optional[str])

  • id (Optional[str])

property definition: str

Definition of business node

property id: None | str

Optional id for the GlossaryNode

property name: None | str

Display name of the node

property parentNode: None | str

Parent node of the glossary term

class datahub.metadata.schema_classes.GlossaryNodeKeyClass(name)

Bases: _Aspect

Key for a GlossaryNode

Parameters:

name (str)

property name: str
class datahub.metadata.schema_classes.GlossaryNodeSnapshotClass(urn, aspects)

Bases: DictWrapper

A metadata snapshot for a specific GlossaryNode entity.

Parameters:
property aspects: List[GlossaryNodeKeyClass | GlossaryNodeInfoClass | OwnershipClass | StatusClass]

The list of metadata aspects associated with the GlossaryNode. Depending on the use case, this can either be all, or a selection, of supported aspects.

property urn: str

URN for the entity the metadata snapshot is associated with.

class datahub.metadata.schema_classes.GlossaryRelatedTermsClass(isRelatedTerms=None, hasRelatedTerms=None, values=None, relatedTerms=None)

Bases: _Aspect

Has A / Is A lineage information about a glossary Term reporting the lineage

Parameters:
  • isRelatedTerms (Optional[List[str]])

  • hasRelatedTerms (Optional[List[str]])

  • values (Optional[List[str]])

  • relatedTerms (Optional[List[str]])

property hasRelatedTerms: None | List[str]

The relationship Has A with glossary term

property isRelatedTerms: None | List[str]

The relationship Is A with glossary term

property relatedTerms: None | List[str]

The relationship isRelatedTo with glossary term

property values: None | List[str]

The relationship Has Value with glossary term. These are the fixed values a term has. For example, a ColorEnum where RED, GREEN and YELLOW are fixed values.

class datahub.metadata.schema_classes.GlossaryTermAssociationClass(urn, context=None)

Bases: DictWrapper

Properties of an applied glossary term.

Parameters:
  • urn (str)

  • context (Optional[str])

property context: None | str

Additional context about the association

property urn: str

Urn of the applied glossary term

class datahub.metadata.schema_classes.GlossaryTermInfoClass(definition, termSource, customProperties=None, id=None, name=None, parentNode=None, sourceRef=None, sourceUrl=None, rawSchema=None)

Bases: _Aspect

Properties associated with a GlossaryTerm

Parameters:
  • definition (str)

  • termSource (str)

  • customProperties (Optional[Dict[str, str]])

  • id (Optional[str])

  • name (Optional[str])

  • parentNode (Optional[str])

  • sourceRef (Optional[str])

  • sourceUrl (Optional[str])

  • rawSchema (Optional[str])

property customProperties: Dict[str, str]

Custom property bag.

property definition: str

Definition of business term.

property id: None | str

Optional id for the term

property name: None | str

Display name of the term

property parentNode: None | str

Parent node of the glossary term

property rawSchema: None | str

Schema definition of the glossary term

property sourceRef: None | str

External Reference to the business-term

property sourceUrl: None | str

The abstracted URL such as https://spec.edmcouncil.org/fibo/ontology/FBC/FinancialInstruments/FinancialInstruments/CashInstrument.

property termSource: str

Source of the Business Term (INTERNAL or EXTERNAL) with default value as INTERNAL

class datahub.metadata.schema_classes.GlossaryTermKeyClass(name)

Bases: _Aspect

Key for a GlossaryTerm

Parameters:

name (str)

property name: str

The term name, which serves as a unique id

class datahub.metadata.schema_classes.GlossaryTermSnapshotClass(urn, aspects)

Bases: DictWrapper

A metadata snapshot for a specific GlossaryTerm entity.

Parameters:
property aspects: List[GlossaryTermKeyClass | GlossaryTermInfoClass | OwnershipClass | StatusClass | BrowsePathsClass | GlossaryRelatedTermsClass]

The list of metadata aspects associated with the GlossaryTerm. Depending on the use case, this can either be all, or a selection, of supported aspects.

property urn: str

URN for the entity the metadata snapshot is associated with.

class datahub.metadata.schema_classes.GlossaryTermsClass(terms, auditStamp)

Bases: _Aspect

Related business terms information

Parameters:
property auditStamp: AuditStampClass

Audit stamp containing who reported the related business term

property terms: List[GlossaryTermAssociationClass]

The related business terms

class datahub.metadata.schema_classes.GroupMembershipClass(groups)

Bases: _Aspect

Carries information about the CorpGroups a user is in.

Parameters:

groups (List[str])

property groups: List[str]
class datahub.metadata.schema_classes.HistogramClass(boundaries, heights)

Bases: DictWrapper

Parameters:
  • boundaries (List[str])

  • heights (List[float])

property boundaries: List[str]
property heights: List[float]
class datahub.metadata.schema_classes.IngestionCheckpointStateClass(formatVersion, serde, payload=None)

Bases: DictWrapper

The checkpoint state object of a datahub ingestion run for a given job.

Parameters:
  • formatVersion (str)

  • serde (str)

  • payload (Optional[bytes])

property formatVersion: str

The version of the state format.

property payload: None | bytes

Opaque blob of the state representation.

property serde: str

The serialization/deserialization protocol.

class datahub.metadata.schema_classes.InputFieldClass(schemaFieldUrn, schemaField=None)

Bases: DictWrapper

Information about a field a chart or dashboard references

Parameters:
property schemaField: None | SchemaFieldClass

Copied version of the referenced schema field object for indexing purposes

property schemaFieldUrn: str

Urn of the schema being referenced for lineage purposes

class datahub.metadata.schema_classes.InputFieldsClass(fields)

Bases: _Aspect

Information about the fields a chart or dashboard references

Parameters:

fields (List[InputFieldClass])

property fields: List[InputFieldClass]

List of fields being referenced

class datahub.metadata.schema_classes.InstitutionalMemoryClass(elements)

Bases: _Aspect

Institutional memory of an entity. This is a way to link to relevant documentation and provide description of the documentation. Institutional or tribal knowledge is very important for users to leverage the entity.

Parameters:

elements (List[InstitutionalMemoryMetadataClass])

property elements: List[InstitutionalMemoryMetadataClass]

List of records that represent institutional memory of an entity. Each record consists of a link, description, creator and timestamps associated with that record.

class datahub.metadata.schema_classes.InstitutionalMemoryMetadataClass(url, description, createStamp)

Bases: DictWrapper

Metadata corresponding to a record of institutional memory.

Parameters:
property createStamp: AuditStampClass

Audit stamp associated with creation of this record

property description: str

Description of the link.

property url: str

Link to an engineering design document or a wiki page.

class datahub.metadata.schema_classes.IntendedUseClass(primaryUses=None, primaryUsers=None, outOfScopeUses=None)

Bases: _Aspect

Intended Use for the ML Model

Parameters:
  • primaryUses (Optional[List[str]])

  • primaryUsers (Optional[List[Union[str, IntendedUserTypeClass]]])

  • outOfScopeUses (Optional[List[str]])

property outOfScopeUses: None | List[str]

Highlight technology that the MLModel might easily be confused with, or related contexts that users could try to apply the MLModel to.

property primaryUsers: None | List[str | IntendedUserTypeClass]

Primary Intended Users - For example, was the MLModel developed for entertainment purposes, for hobbyists, or enterprise solutions?

property primaryUses: None | List[str]

Primary Use cases for the MLModel.

class datahub.metadata.schema_classes.IntendedUserTypeClass

Bases: object

ENTERPRISE = 'ENTERPRISE'
ENTERTAINMENT = 'ENTERTAINMENT'
HOBBY = 'HOBBY'

class datahub.metadata.schema_classes.InviteTokenClass(token, role=None)

Bases: _Aspect

Aspect used to store invite tokens.

Parameters:
  • token (str)

  • role (Optional[str])

property role: None | str

The role that this invite token may be associated with

property token: str

The encrypted invite token.

class datahub.metadata.schema_classes.InviteTokenKeyClass(id)

Bases: _Aspect

Key for an InviteToken.

Parameters:

id (str)

property id: str

A unique id for the invite token.

class datahub.metadata.schema_classes.JobStatusClass

Bases: object

Job statuses

COMPLETED = 'COMPLETED'

Jobs with successful completion.

FAILED = 'FAILED'

Jobs that have failed.

IN_PROGRESS = 'IN_PROGRESS'

Jobs currently running.

SKIPPED = 'SKIPPED'

Jobs that have been skipped.

STARTING = 'STARTING'

STOPPED = 'STOPPED'

Jobs that have stopped.

STOPPING = 'STOPPING'

Jobs being stopped.

UNKNOWN = 'UNKNOWN'

Jobs with unknown status (either unmappable or unavailable)

class datahub.metadata.schema_classes.KafkaAuditHeaderClass(time, server, appName, messageId, instance=None, auditVersion=None, fabricUrn=None, clusterConnectionString=None)

Bases: DictWrapper

This header records information about the context of an event as it is emitted into kafka and is intended to be used by the kafka audit application. For more information see go/kafkaauditheader

Parameters:
  • time (int)

  • server (str)

  • appName (str)

  • messageId (bytes)

  • instance (Optional[str])

  • auditVersion (Optional[int])

  • fabricUrn (Optional[str])

  • clusterConnectionString (Optional[str])

property appName: str

The name of the application from which the event is being emitted. see go/appname

property auditVersion: None | int

The version that is being used for auditing. In version 0, the audit trail buckets events into 10 minute audit windows based on the EventHeader timestamp. In version 1, the audit trail buckets events as follows: if the schema has an outer KafkaAuditHeader, use the outer audit header timestamp for bucketing; else if the EventHeader has an inner KafkaAuditHeader use that inner audit header’s timestamp for bucketing

property clusterConnectionString: None | str

This is a String that the client uses to establish some kind of connection with the Kafka cluster. The exact format of it depends on specific versions of clients and brokers. This information could potentially identify the fabric and cluster with which the client is producing to or consuming from.

property fabricUrn: None | str

The fabricUrn of the host from which the event is being emitted. Fabric Urn in the format of urn:li:fabric:{fabric_name}. See go/fabric.

property instance: None | str

The instance on the server from which the event is being emitted. e.g. i001

property messageId: bytes

A unique identifier for the message

property server: str

The fully qualified name of the host from which the event is being emitted.

property time: int

The time at which the event was emitted into kafka.

class datahub.metadata.schema_classes.KafkaSchemaClass(documentSchema, documentSchemaType=None, keySchema=None, keySchemaType=None)

Bases: DictWrapper

Schema holder for kafka schema.

Parameters:
  • documentSchema (str)

  • documentSchemaType (Optional[str])

  • keySchema (Optional[str])

  • keySchemaType (Optional[str])

property documentSchema: str

The native kafka document schema. This is a human readable avro document schema.

property documentSchemaType: None | str

The native kafka document schema type. This can be AVRO/PROTOBUF/JSON.

property keySchema: None | str

The native kafka key schema as retrieved from Schema Registry

property keySchemaType: None | str

The native kafka key schema type. This can be AVRO/PROTOBUF/JSON.

class datahub.metadata.schema_classes.KeyValueSchemaClass(keySchema, valueSchema)

Bases: DictWrapper

Schema text of a key-value store schema.

Parameters:
  • keySchema (str)

  • valueSchema (str)

property keySchema: str

The raw schema for the key in the key-value store.

property valueSchema: str

The raw schema for the value in the key-value store.

class datahub.metadata.schema_classes.MLFeatureDataTypeClass

Bases: object

MLFeature Data Type

AUDIO = 'AUDIO'

Audio Data

BINARY = 'BINARY'

Binary data is discrete data that can be in only one of two categories - either yes or no, 1 or 0, off or on, etc

BYTE = 'BYTE'

Bytes data are binary-encoded values that can represent complex objects.

CONTINUOUS = 'CONTINUOUS'

Continuous data are made of uncountable values, often the result of a measurement such as height, weight, age etc.

COUNT = 'COUNT'

Count data is discrete whole number data - no negative numbers here. Count data often has many small values, such as zero and one.

IMAGE = 'IMAGE'

Image Data

INTERVAL = 'INTERVAL'

Interval data has equal spaces between the numbers and does not represent a temporal pattern. Examples include percentages, temperatures, and income.

MAP = 'MAP'

Mapping Data Type ex: dict, map

NOMINAL = 'NOMINAL'

Nominal data is made of discrete values with no numerical relationship between the different categories - mean and median are meaningless. Animal species is one example. For example, pig is not higher than bird and lower than fish.

ORDINAL = 'ORDINAL'

Ordinal data are discrete integers that can be ranked or sorted. For example, the distance between first and second may not be the same as the distance between second and third.

SEQUENCE = 'SEQUENCE'

Sequence Data Type ex: list, tuple, range

SET = 'SET'

Set Data Type ex: set, frozenset

TEXT = 'TEXT'

Text Data

TIME = 'TIME'

Time data is a cyclical, repeating continuous form of data. The relevant time features can be any period- daily, weekly, monthly, annual, etc.

UNKNOWN = 'UNKNOWN'

Unknown data are data that we don’t know the type for.

USELESS = 'USELESS'

VIDEO = 'VIDEO'

Video Data

class datahub.metadata.schema_classes.MLFeatureKeyClass(featureNamespace, name)

Bases: _Aspect

Key for an MLFeature

Parameters:
  • featureNamespace (str)

  • name (str)

property featureNamespace: str

Namespace for the feature

property name: str

Name of the feature

class datahub.metadata.schema_classes.MLFeaturePropertiesClass(description=None, dataType=None, version=None, sources=None)

Bases: _Aspect

Properties associated with a MLFeature

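Parameters:
  • description (Optional[str])

  • dataType (Union[None, str, MLFeatureDataTypeClass])

  • version (Optional[VersionTagClass])

  • sources (Optional[List[str]])

property dataType: None | str | MLFeatureDataTypeClass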

Data Type of the MLFeature

property description: None | str

Documentation of the MLFeature

property sources: None | List[str]

Source of the MLFeature

property version: None | VersionTagClass

Version of the MLFeature

class datahub.metadata.schema_classes.MLFeatureSnapshotClass(urn, aspects)

Bases: DictWrapper

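Parameters:
  • urn (str)

  • aspects (List[Union[MLFeatureKeyClass, MLFeaturePropertiesClass, OwnershipClass, InstitutionalMemoryClass, StatusClass, DeprecationClass, BrowsePathsClass, GlobalTagsClass, DataPlatformInstanceClass, BrowsePathsV2Class]])

property aspects: List[MLFeatureKeyClass | MLFeaturePropertiesClass | OwnershipClass | InstitutionalMemoryClass | StatusClass | DeprecationClass | BrowsePathsClass | GlobalTagsClass | DataPlatformInstanceClass | BrowsePathsV2Class]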

The list of metadata aspects associated with the MLFeature. Depending on the use case, this can either be all, or a selection, of supported aspects.

property urn: str

URN for the entity the metadata snapshot is associated with.

class datahub.metadata.schema_classes.MLFeatureTableKeyClass(platform, name)

Bases: _Aspect

Key for an MLFeatureTable

Parameters:
  • platform (str)

  • name (str)

property name: str

Name of the feature table

property platform: str

Data platform urn associated with the feature table

class datahub.metadata.schema_classes.MLFeatureTablePropertiesClass(customProperties=None, description=None, mlFeatures=None, mlPrimaryKeys=None)

Bases: _Aspect

Properties associated with a MLFeatureTable

Parameters:
  • customProperties (Optional[Dict[str, str]])

  • description (Optional[str])

  • mlFeatures (Optional[List[str]])

  • mlPrimaryKeys (Optional[List[str]])

property customProperties: Dict[str, str]

Custom property bag.

property description: None | str

Documentation of the MLFeatureTable

property mlFeatures: None | List[str]

List of features contained in the feature table

property mlPrimaryKeys: None | List[str]

List of primary keys in the feature table (if multiple, assumed to act as a composite key)

class datahub.metadata.schema_classes.MLFeatureTableSnapshotClass(urn, aspects)

Bases: DictWrapper

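Parameters:
  • urn (str)

  • aspects (List[Union[MLFeatureTableKeyClass, MLFeatureTablePropertiesClass, OwnershipClass, InstitutionalMemoryClass, StatusClass, DeprecationClass, BrowsePathsClass, GlobalTagsClass, DataPlatformInstanceClass, BrowsePathsV2Class]])

property aspects: List[MLFeatureTableKeyClass | MLFeatureTablePropertiesClass | OwnershipClass | InstitutionalMemoryClass | StatusClass | DeprecationClass | BrowsePathsClass | GlobalTagsClass | DataPlatformInstanceClass | BrowsePathsV2Class]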

The list of metadata aspects associated with the MLFeatureTable. Depending on the use case, this can either be all, or a selection, of supported aspects.

property urn: str

URN for the entity the metadata snapshot is associated with.

class datahub.metadata.schema_classes.MLHyperParamClass(name, description=None, value=None, createdAt=None)

Bases: _Aspect

Properties associated with an ML Hyper Param

Parameters:
  • name (str)

  • description (Optional[str])

  • value (Optional[str])

  • createdAt (Optional[int])

property createdAt: None | int

Date when the MLHyperParam was developed

property description: None | str

Documentation of the MLHyperParam

property name: str

Name of the MLHyperParam

property value: None | str

The value of the MLHyperParam

class datahub.metadata.schema_classes.MLMetricClass(name, description=None, value=None, createdAt=None)

Bases: _Aspect

Properties associated with an ML Metric

Parameters:
  • name (str)

  • description (Optional[str])

  • value (Optional[str])

  • createdAt (Optional[int])

property createdAt: None | int

Date when the mlMetric was developed

property description: None | str

Documentation of the mlMetric

property name: str

Name of the mlMetric

property value: None | str

The value of the mlMetric

class datahub.metadata.schema_classes.MLModelDeploymentKeyClass(platform, name, origin)

Bases: _Aspect

Key for an ML model deployment

Parameters:
  • platform (str)

  • name (str)

  • origin (Union[str, FabricTypeClass])

property name: str

Name of the MLModelDeployment

property origin: str | FabricTypeClass

Fabric type where model Deployment belongs to or where it was generated

property platform: str

Standardized platform urn for the model Deployment

class datahub.metadata.schema_classes.MLModelDeploymentPropertiesClass(customProperties=None, externalUrl=None, description=None, createdAt=None, version=None, status=None)

Bases: _Aspect

Properties associated with an ML Model Deployment

Parameters:
  • customProperties (Optional[Dict[str, str]])

  • externalUrl (Optional[str])

  • description (Optional[str])

  • createdAt (Optional[int])

  • version (Optional[VersionTagClass])

  • status (Union[None, str, DeploymentStatusClass])

property createdAt: None | int

Date when the MLModelDeployment was developed

property customProperties: Dict[str, str]

Custom property bag.

property description: None | str

Documentation of the MLModelDeployment

property externalUrl: None | str

URL where the reference exists

property status: None | str | DeploymentStatusClass

Status of the deployment

property version: None | VersionTagClass

Version of the MLModelDeployment

class datahub.metadata.schema_classes.MLModelDeploymentSnapshotClass(urn, aspects)

Bases: DictWrapper

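Parameters:
  • urn (str)

  • aspects (List[Union[MLModelDeploymentKeyClass, MLModelDeploymentPropertiesClass, OwnershipClass, StatusClass, DeprecationClass, GlobalTagsClass, DataPlatformInstanceClass]])

property aspects: List[MLModelDeploymentKeyClass | MLModelDeploymentPropertiesClass | OwnershipClass | StatusClass | DeprecationClass | GlobalTagsClass | DataPlatformInstanceClass]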

The list of metadata aspects associated with the MLModelDeployment. Depending on the use case, this can either be all, or a selection, of supported aspects.

property urn: str

URN for the entity the metadata snapshot is associated with.

class datahub.metadata.schema_classes.MLModelFactorPromptsClass(relevantFactors=None, evaluationFactors=None)

Bases: _Aspect

Prompts which affect the performance of the MLModel

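Parameters:
  • relevantFactors (Optional[List[MLModelFactorsClass]])

  • evaluationFactors (Optional[List[MLModelFactorsClass]])

property evaluationFactors: None | List[MLModelFactorsClass]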

Which factors are being reported, and why were these chosen?

property relevantFactors: None | List[MLModelFactorsClass]

What are foreseeable salient factors for which MLModel performance may vary, and how were these determined?

class datahub.metadata.schema_classes.MLModelFactorsClass(groups=None, instrumentation=None, environment=None)

Bases: DictWrapper

Factors affecting the performance of the MLModel.

Parameters:
  • groups (Optional[List[str]])

  • instrumentation (Optional[List[str]])

  • environment (Optional[List[str]])

property environment: None | List[str]

A further factor affecting MLModel performance is the environment in which it is deployed.

property groups: None | List[str]

Groups refers to distinct categories with similar characteristics that are present in the evaluation data instances. For human-centric machine learning MLModels, groups are people who share one or multiple characteristics.

property instrumentation: None | List[str]

The performance of a MLModel can vary depending on what instruments were used to capture the input to the MLModel. For example, a face detection model may perform differently depending on the camera’s hardware and software, including lens, image stabilization, high dynamic range techniques, and background blurring for portrait mode.

class datahub.metadata.schema_classes.MLModelGroupKeyClass(platform, name, origin)

Bases: _Aspect

Key for an ML model group

Parameters:
  • platform (str)

  • name (str)

  • origin (Union[str, FabricTypeClass])

property name: str

Name of the MLModelGroup

property origin: str | FabricTypeClass

Fabric type where model group belongs to or where it was generated

property platform: str

Standardized platform urn for the model group

class datahub.metadata.schema_classes.MLModelGroupPropertiesClass(customProperties=None, description=None, createdAt=None, version=None)

Bases: _Aspect

Properties associated with an ML Model Group

Parameters:
  • customProperties (Optional[Dict[str, str]])

  • description (Optional[str])

  • createdAt (Optional[int])

  • version (Optional[VersionTagClass])

property createdAt: None | int

Date when the MLModelGroup was developed

property customProperties: Dict[str, str]

Custom property bag.

property description: None | str

Documentation of the MLModelGroup

property version: None | VersionTagClass

Version of the MLModelGroup

class datahub.metadata.schema_classes.MLModelGroupSnapshotClass(urn, aspects)

Bases: DictWrapper

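Parameters:
  • urn (str)

  • aspects (List[Union[MLModelGroupKeyClass, MLModelGroupPropertiesClass, OwnershipClass, StatusClass, DeprecationClass, BrowsePathsClass, GlobalTagsClass, DataPlatformInstanceClass, BrowsePathsV2Class]])

property aspects: List[MLModelGroupKeyClass | MLModelGroupPropertiesClass | OwnershipClass | StatusClass | DeprecationClass | BrowsePathsClass | GlobalTagsClass | DataPlatformInstanceClass | BrowsePathsV2Class]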

The list of metadata aspects associated with the MLModelGroup. Depending on the use case, this can either be all, or a selection, of supported aspects.

property urn: str

URN for the entity the metadata snapshot is associated with.

class datahub.metadata.schema_classes.MLModelKeyClass(platform, name, origin)

Bases: _Aspect

Key for an ML model

Parameters:
  • platform (str)

  • name (str)

  • origin (Union[str, FabricTypeClass])

property name: str

Name of the MLModel

property origin: str | FabricTypeClass

Fabric type where model belongs to or where it was generated

property platform: str

Standardized platform urn for the model

class datahub.metadata.schema_classes.MLModelPropertiesClass(customProperties=None, externalUrl=None, description=None, date=None, version=None, type=None, hyperParameters=None, hyperParams=None, trainingMetrics=None, onlineMetrics=None, mlFeatures=None, tags=None, deployments=None, trainingJobs=None, downstreamJobs=None, groups=None)

Bases: _Aspect

Properties associated with a ML Model

Parameters:
  • customProperties (Optional[Dict[str, str]])

  • externalUrl (Optional[str])

  • description (Optional[str])

  • date (Optional[int])

  • version (Optional[VersionTagClass])

  • type (Optional[str])

  • hyperParameters (Optional[Dict[str, Union[str, int, float, bool]]])

  • hyperParams (Optional[List[MLHyperParamClass]])

  • trainingMetrics (Optional[List[MLMetricClass]])

  • onlineMetrics (Optional[List[MLMetricClass]])

  • mlFeatures (Optional[List[str]])

  • tags (Optional[List[str]])

  • deployments (Optional[List[str]])

  • trainingJobs (Optional[List[str]])

  • downstreamJobs (Optional[List[str]])

  • groups (Optional[List[str]])

property customProperties: Dict[str, str]

Custom property bag.

property date: None | int

Date when the MLModel was developed

property deployments: None | List[str]

Deployments for the MLModel

property description: None | str

Documentation of the MLModel

property downstreamJobs: None | List[str]

List of jobs (if any) that use the model

property externalUrl: None | str

URL where the reference exists

property groups: None | List[str]

Groups the model belongs to

property hyperParameters: None | Dict[str, str | int | float | bool]

Hyper Parameters of the MLModel

NOTE: these are deprecated in favor of hyperParams

property hyperParams: None | List[MLHyperParamClass]

Hyperparameters of the MLModel

property mlFeatures: None | List[str]

List of features used for MLModel training

property onlineMetrics: None | List[MLMetricClass]

Metrics of the MLModel used in production

property tags: List[str]

Tags for the MLModel

property trainingJobs: None | List[str]

List of jobs (if any) used to train the model

property trainingMetrics: None | List[MLMetricClass]

Metrics of the MLModel used in training

property type: None | str

Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc

property version: None | VersionTagClass

Version of the MLModel

class datahub.metadata.schema_classes.MLModelSnapshotClass(urn, aspects)

Bases: DictWrapper

MLModel Snapshot entity details.

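Parameters:
  • urn (str)

  • aspects (List[Union[MLModelKeyClass, OwnershipClass, MLModelPropertiesClass, IntendedUseClass, MLModelFactorPromptsClass, MetricsClass, EvaluationDataClass, TrainingDataClass, QuantitativeAnalysesClass, EthicalConsiderationsClass, CaveatsAndRecommendationsClass, InstitutionalMemoryClass, SourceCodeClass, StatusClass, CostClass, DeprecationClass, BrowsePathsClass, GlobalTagsClass, DataPlatformInstanceClass, BrowsePathsV2Class]])

property aspects: List[MLModelKeyClass | OwnershipClass | MLModelPropertiesClass | IntendedUseClass | MLModelFactorPromptsClass | MetricsClass | EvaluationDataClass | TrainingDataClass | QuantitativeAnalysesClass | EthicalConsiderationsClass | CaveatsAndRecommendationsClass | InstitutionalMemoryClass | SourceCodeClass | StatusClass | CostClass | DeprecationClass | BrowsePathsClass | GlobalTagsClass | DataPlatformInstanceClass | BrowsePathsV2Class]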

The list of metadata aspects associated with the MLModel. Depending on the use case, this can either be all, or a selection, of supported aspects.

property urn: str

URN for the entity the metadata snapshot is associated with.

class datahub.metadata.schema_classes.MLPrimaryKeyKeyClass(featureNamespace, name)

Bases: _Aspect

Key for an MLPrimaryKey

Parameters:
  • featureNamespace (str)

  • name (str)

property featureNamespace: str

Namespace for the primary key

property name: str

Name of the primary key

class datahub.metadata.schema_classes.MLPrimaryKeyPropertiesClass(sources, description=None, dataType=None, version=None)

Bases: _Aspect

Properties associated with a MLPrimaryKey

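Parameters:
  • sources (List[str])

  • description (Optional[str])

  • dataType (Union[None, str, MLFeatureDataTypeClass])

  • version (Optional[VersionTagClass])

property dataType: None | str | MLFeatureDataTypeClass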

Data Type of the MLPrimaryKey

property description: None | str

Documentation of the MLPrimaryKey

property sources: List[str]

Source of the MLPrimaryKey

property version: None | VersionTagClass

Version of the MLPrimaryKey

class datahub.metadata.schema_classes.MLPrimaryKeySnapshotClass(urn, aspects)

Bases: DictWrapper

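Parameters:
  • urn (str)

  • aspects (List[Union[MLPrimaryKeyKeyClass, MLPrimaryKeyPropertiesClass, OwnershipClass, InstitutionalMemoryClass, StatusClass, DeprecationClass, GlobalTagsClass, DataPlatformInstanceClass]])

property aspects: List[MLPrimaryKeyKeyClass | MLPrimaryKeyPropertiesClass | OwnershipClass | InstitutionalMemoryClass | StatusClass | DeprecationClass | GlobalTagsClass | DataPlatformInstanceClass]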

The list of metadata aspects associated with the MLPrimaryKey. Depending on the use case, this can either be all, or a selection, of supported aspects.

property urn: str

URN for the entity the metadata snapshot is associated with.

class datahub.metadata.schema_classes.MapTypeClass(keyType=None, valueType=None)

Bases: DictWrapper

Map field type.

Parameters:
  • keyType (Optional[str])

  • valueType (Optional[str])

property keyType: None | str

Key type in a map

property valueType: None | str

Type of the value in a map

class datahub.metadata.schema_classes.MediaClass(type, location)

Bases: DictWrapper

Carries information about a piece of media content: the type of content it holds and where that content is stored.

Parameters:
  • type (Union[str, MediaTypeClass])

  • location (str)

property location: str

Where the media content is stored.

property type: str | MediaTypeClass

Type of content the Media is storing, e.g. image, video, etc.

class datahub.metadata.schema_classes.MediaTypeClass

Bases: object

Enum defining the type of content a Media object holds.

IMAGE = 'IMAGE'

class datahub.metadata.schema_classes.MetadataChangeEventClass(proposedSnapshot, auditHeader=None, proposedDelta=None, systemMetadata=None)

Bases: DictWrapper

Kafka event for proposing a metadata change for an entity. A corresponding MetadataAuditEvent is emitted when the change is accepted and committed, otherwise a FailedMetadataChangeEvent will be emitted instead.

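Parameters:
  • proposedSnapshot (Union[ChartSnapshotClass, CorpGroupSnapshotClass, CorpUserSnapshotClass, DashboardSnapshotClass, DataFlowSnapshotClass, DataJobSnapshotClass, DatasetSnapshotClass, DataProcessSnapshotClass, DataPlatformSnapshotClass, MLModelSnapshotClass, MLPrimaryKeySnapshotClass, MLFeatureSnapshotClass, MLFeatureTableSnapshotClass, MLModelDeploymentSnapshotClass, MLModelGroupSnapshotClass, TagSnapshotClass, GlossaryTermSnapshotClass, GlossaryNodeSnapshotClass, DataHubPolicySnapshotClass, SchemaFieldSnapshotClass, DataHubRetentionSnapshotClass])

  • auditHeader (Optional[KafkaAuditHeaderClass])

  • proposedDelta (None)

  • systemMetadata (Optional[SystemMetadataClass])

property auditHeader: None | KafkaAuditHeaderClass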

Kafka audit header. See go/kafkaauditheader for more info.

property proposedDelta: None

Delta of the proposed metadata partial update.

property proposedSnapshot: ChartSnapshotClass | CorpGroupSnapshotClass | CorpUserSnapshotClass | DashboardSnapshotClass | DataFlowSnapshotClass | DataJobSnapshotClass | DatasetSnapshotClass | DataProcessSnapshotClass | DataPlatformSnapshotClass | MLModelSnapshotClass | MLPrimaryKeySnapshotClass | MLFeatureSnapshotClass | MLFeatureTableSnapshotClass | MLModelDeploymentSnapshotClass | MLModelGroupSnapshotClass | TagSnapshotClass | GlossaryTermSnapshotClass | GlossaryNodeSnapshotClass | DataHubPolicySnapshotClass | SchemaFieldSnapshotClass | DataHubRetentionSnapshotClass

Snapshot of the proposed metadata change. Include only the aspects affected by the change in the snapshot.

property systemMetadata: None | SystemMetadataClass

Metadata around how the snapshot was ingested

class datahub.metadata.schema_classes.MetadataChangeLogClass(entityType, changeType, auditHeader=None, entityUrn=None, entityKeyAspect=None, aspectName=None, aspect=None, systemMetadata=None, previousAspectValue=None, previousSystemMetadata=None, created=None)

Bases: DictWrapper

Kafka event for capturing update made to an entity’s metadata.

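Parameters:
  • entityType (str)

  • changeType (Union[str, ChangeTypeClass])

  • auditHeader (Optional[KafkaAuditHeaderClass])

  • entityUrn (Optional[str])

  • entityKeyAspect (Optional[GenericAspectClass])

  • aspectName (Optional[str])

  • aspect (Optional[GenericAspectClass])

  • systemMetadata (Optional[SystemMetadataClass])

  • previousAspectValue (Optional[GenericAspectClass])

  • previousSystemMetadata (Optional[SystemMetadataClass])

  • created (Optional[AuditStampClass])

property aspect: None | GenericAspectClass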

The value of the new aspect.

property aspectName: None | str

Aspect of the entity being written to. Not filling this out implies that the writer wants to affect the entire entity. Note: This is only valid for CREATE, UPSERT, and DELETE operations.

property auditHeader: None | KafkaAuditHeaderClass

Kafka audit header. Currently remains unused in the open source.

property changeType: str | ChangeTypeClass

Type of change being proposed

property created: None | AuditStampClass

An audit stamp detailing who and when the aspect was changed by. Required for all intents and purposes.

property entityKeyAspect: None | GenericAspectClass

Key aspect of the entity being written

property entityType: str

Type of the entity being written to

property entityUrn: None | str

Urn of the entity being written

property previousAspectValue: None | GenericAspectClass

The previous value of the aspect that has changed.

property previousSystemMetadata: None | SystemMetadataClass

The previous value of the system metadata field that has changed.

property systemMetadata: None | SystemMetadataClass

A string->string map of custom properties that one might want to attach to an event

class datahub.metadata.schema_classes.MetadataChangeProposalClass(entityType, changeType, auditHeader=None, entityUrn=None, entityKeyAspect=None, aspectName=None, aspect=None, systemMetadata=None)

Bases: DictWrapper

Kafka event for proposing a metadata change for an entity. A corresponding MetadataChangeLog is emitted when the change is accepted and committed, otherwise a FailedMetadataChangeProposal will be emitted instead.

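Parameters:
  • entityType (str)

  • changeType (Union[str, ChangeTypeClass])

  • auditHeader (Optional[KafkaAuditHeaderClass])

  • entityUrn (Optional[str])

  • entityKeyAspect (Optional[GenericAspectClass])

  • aspectName (Optional[str])

  • aspect (Optional[GenericAspectClass])

  • systemMetadata (Optional[SystemMetadataClass])

property aspect: None | GenericAspectClass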

The value of the new aspect.

property aspectName: None | str

Aspect of the entity being written to. Not filling this out implies that the writer wants to affect the entire entity. Note: This is only valid for CREATE, UPSERT, and DELETE operations.

property auditHeader: None | KafkaAuditHeaderClass

Kafka audit header. Currently remains unused in the open source.

property changeType: str | ChangeTypeClass

Type of change being proposed

property entityKeyAspect: None | GenericAspectClass

Key aspect of the entity being written

property entityType: str

Type of the entity being written to

property entityUrn: None | str

Urn of the entity being written

property systemMetadata: None | SystemMetadataClass

A string->string map of custom properties that one might want to attach to an event

class datahub.metadata.schema_classes.MetricsClass(performanceMeasures=None, decisionThreshold=None)

Bases: _Aspect

Metrics to be featured for the MLModel.

Parameters:
  • performanceMeasures (Optional[List[str]])

  • decisionThreshold (Optional[List[str]])

property decisionThreshold: None | List[str]

Decision Thresholds used (if any)?

property performanceMeasures: None | List[str]

Measures of MLModel performance

class datahub.metadata.schema_classes.MySqlDDLClass(tableSchema)

Bases: DictWrapper

Schema holder for MySql data definition language that describes an MySql table.

Parameters:

tableSchema (str)

property tableSchema: str

The native schema in the dataset’s platform. This is a human readable (json blob) table schema.

class datahub.metadata.schema_classes.NativeGroupMembershipClass(nativeGroups)

Bases: _Aspect

Carries information about the native CorpGroups a user is in.

Parameters:

nativeGroups (List[str])

property nativeGroups: List[str]

class datahub.metadata.schema_classes.NotebookCellClass(type, textCell=None, queryCell=None, chartCell=None)

Bases: DictWrapper

A record of all supported cells for a Notebook. Only one type of cell will be non-null.

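Parameters:
  • type (Union[str, NotebookCellTypeClass])

  • textCell (Optional[TextCellClass])

  • queryCell (Optional[QueryCellClass])

  • chartCell (Optional[ChartCellClass])

property chartCell: None | ChartCellClass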

The chart cell content. This will be non-null only when all other cell fields are null.

property queryCell: None | QueryCellClass

The query cell content. This will be non-null only when all other cell fields are null.

property textCell: None | TextCellClass

The text cell content. This will be non-null only when all other cell fields are null.

property type: str | NotebookCellTypeClass

The type of this Notebook cell

class datahub.metadata.schema_classes.NotebookCellTypeClass

Bases: object

Type of Notebook Cell

CHART_CELL = 'CHART_CELL'

CHART Notebook cell type. The cell content is chart only.

QUERY_CELL = 'QUERY_CELL'

QUERY Notebook cell type. The cell content is query only.

TEXT_CELL = 'TEXT_CELL'

class datahub.metadata.schema_classes.NotebookContentClass(cells=None)

Bases: _Aspect

Content in a Notebook. Note: This is in BETA version.

Parameters:

cells (Optional[List[NotebookCellClass]])

property cells: List[NotebookCellClass]

The content of a Notebook which is composed by a list of NotebookCell

class datahub.metadata.schema_classes.NotebookInfoClass(title, changeAuditStamps, customProperties=None, externalUrl=None, description=None)

Bases: _Aspect

Information about a Notebook. Note: This is in BETA version.

Parameters:
  • title (str)

  • changeAuditStamps (ChangeAuditStampsClass)

  • customProperties (Optional[Dict[str, str]])

  • externalUrl (Optional[str])

  • description (Optional[str])

property changeAuditStamps: ChangeAuditStampsClass

Captures information about who created/last modified/deleted this Notebook and when

property customProperties: Dict[str, str]

Custom property bag.

property description: None | str

Detailed description about the Notebook

property externalUrl: None | str

URL where the reference exists

property title: str

Title of the Notebook

class datahub.metadata.schema_classes.NotebookKeyClass(notebookTool, notebookId)

Bases: _Aspect

Key for a Notebook

Parameters:
  • notebookTool (str)

  • notebookId (str)

property notebookId: str

Unique id for the Notebook. This id should be globally unique for a Notebook tool even when there are multiple deployments of it. As an example, Notebook URL could be used here for QueryBook such as ‘querybook.com/notebook/773’

property notebookTool: str

The name of the Notebook tool such as QueryBook, etc.

class datahub.metadata.schema_classes.NullTypeClass

Bases: DictWrapper

Null field type.

class datahub.metadata.schema_classes.NumberTypeClass

Bases: DictWrapper

Number data type: long, integer, short, etc..

class datahub.metadata.schema_classes.OperationClass(timestampMillis, operationType, lastUpdatedTimestamp, eventGranularity=None, partitionSpec=None, messageId=None, actor=None, customOperationType=None, numAffectedRows=None, affectedDatasets=None, sourceType=None, customProperties=None)

Bases: _Aspect

Operational info for an entity.

Parameters:
  • timestampMillis (int)

  • operationType (Union[str, OperationTypeClass])

  • lastUpdatedTimestamp (int)

  • eventGranularity (Optional[TimeWindowSizeClass])

  • partitionSpec (Optional[PartitionSpecClass])

  • messageId (Optional[str])

  • actor (Optional[str])

  • customOperationType (Optional[str])

  • numAffectedRows (Optional[int])

  • affectedDatasets (Optional[List[str]])

  • sourceType (Union[None, str, OperationSourceTypeClass])

  • customProperties (Optional[Dict[str, str]])

ASPECT_TYPE: ClassVar[str] = 'timeseries'

property actor: None | str

Actor who issued this operation.

property affectedDatasets: None | List[str]

Which other datasets were affected by this operation.

property customOperationType: None | str

A custom type of operation. Required if operationType is CUSTOM.

property customProperties: None | Dict[str, str]

Custom properties

property eventGranularity: None | TimeWindowSizeClass

Granularity of the event if applicable

property lastUpdatedTimestamp: int

The time at which the operation occurred. Would be better named ‘operationTime’

property messageId: None | str

The optional messageId, if provided serves as a custom user-defined unique identifier for an aspect value.

property numAffectedRows: None | int

How many rows were affected by this operation.

property operationType: str | OperationTypeClass

Operation type of change.

property partitionSpec: PartitionSpecClass | None

The optional partition specification.

property sourceType: None | str | OperationSourceTypeClass

Source Type

property timestampMillis: int

The event timestamp field as epoch at UTC in milliseconds.

class datahub.metadata.schema_classes.OperationSourceTypeClass

Bases: object

The source of an operation

DATA_PLATFORM = 'DATA_PLATFORM'
DATA_PROCESS = 'DATA_PROCESS'

Rows were updated

class datahub.metadata.schema_classes.OperationTypeClass

Bases: object

Enum to define the operation type when an entity changes.

ALTER = 'ALTER'

Asset was altered

CREATE = 'CREATE'

Asset was created

CUSTOM = 'CUSTOM'

Custom asset operation

DELETE = 'DELETE'

Rows were deleted

DROP = 'DROP'

Asset was dropped

INSERT = 'INSERT'

UNKNOWN = 'UNKNOWN'

UPDATE = 'UPDATE'

Rows were updated

class datahub.metadata.schema_classes.OracleDDLClass(tableSchema)

Bases: DictWrapper

Schema holder for oracle data definition language that describes an oracle table.

Parameters:

tableSchema (str)

property tableSchema: str

The native schema in the dataset’s platform. This is a human readable (json blob) table schema.

class datahub.metadata.schema_classes.OrcSchemaClass(schema)

Bases: DictWrapper

Schema text of an ORC schema.

Parameters:

schema (str)

property schema: str

The native schema for ORC file format.

class datahub.metadata.schema_classes.OriginClass(type, externalType=None)

Bases: _Aspect

Carries information about where an entity originated from.

Parameters:
property externalType: None | str

Only populated if type is EXTERNAL. The externalType of the entity, such as the name of the identity provider.

property type: str | OriginTypeClass

Where an entity originated from. Either NATIVE or EXTERNAL.

class datahub.metadata.schema_classes.OriginTypeClass

Bases: object

Enum to define where an entity originated from.

EXTERNAL = 'EXTERNAL'

The entity is external to DataHub.

NATIVE = 'NATIVE'

The entity is native to DataHub.

class datahub.metadata.schema_classes.OtherSchemaClass(rawSchema)

Bases: DictWrapper

Schema holder for undefined schema types.

Parameters:

rawSchema (str)

property rawSchema: str

The native schema in the dataset’s platform.

class datahub.metadata.schema_classes.OwnerClass(owner, type, typeUrn=None, source=None)

Bases: DictWrapper

Ownership information

Parameters:
property owner: str

Owner URN, e.g. urn:li:corpuser:ldap, urn:li:corpGroup:group_name, and urn:li:multiProduct:mp_name (Caveat: only corpuser is currently supported in the frontend.)

property source: None | OwnershipSourceClass

Source information for the ownership

property type: str | OwnershipTypeClass

The type of the ownership

property typeUrn: None | str

The type of the ownership, given as the URN of an ownership type.

class datahub.metadata.schema_classes.OwnershipClass(owners, lastModified=None)

Bases: _Aspect

Ownership information of an entity.

Parameters:
property lastModified: AuditStampClass

Audit stamp containing who last modified the record and when. A value of 0 in the time field indicates missing data.

property owners: List[OwnerClass]

List of owners of the entity.

class datahub.metadata.schema_classes.OwnershipSourceClass(type, url=None)

Bases: DictWrapper

Source/provider of the ownership information

Parameters:
property type: str | OwnershipSourceTypeClass

The type of the source

property url: None | str

A reference URL for the source

class datahub.metadata.schema_classes.OwnershipSourceTypeClass

Bases: object

AUDIT = 'AUDIT'

Auditing system or audit logs

DATABASE = 'DATABASE'

Database, e.g. GRANTS table

FILE_SYSTEM = 'FILE_SYSTEM'

File system, e.g. file/directory owner

ISSUE_TRACKING_SYSTEM = 'ISSUE_TRACKING_SYSTEM'

Issue tracking system, e.g. Jira

MANUAL = 'MANUAL'

Manually provided by a user

OTHER = 'OTHER'

Other sources

SERVICE = 'SERVICE'

Other ownership-like service, e.g. Nuage, ACL service etc.

SOURCE_CONTROL = 'SOURCE_CONTROL'

SCM system, e.g. GIT, SVN

class datahub.metadata.schema_classes.OwnershipTypeClass

Bases: object

Asset owner types

BUSINESS_OWNER = 'BUSINESS_OWNER'

A person or group who is responsible for logical, or business related, aspects of the asset.

CONSUMER = 'CONSUMER'

A person, group, or service that consumes the data. Deprecated! Use TECHNICAL_OWNER or BUSINESS_OWNER instead.

CUSTOM = 'CUSTOM'
DATAOWNER = 'DATAOWNER'

A person or group that is owning the data. Deprecated! Use TECHNICAL_OWNER instead.

DATA_STEWARD = 'DATA_STEWARD'

A steward, expert, or delegate responsible for the asset.

DELEGATE = 'DELEGATE'

A person or a group that oversees the operation, e.g. a DBA or SRE. Deprecated! Use TECHNICAL_OWNER instead.

DEVELOPER = 'DEVELOPER'

A person or group that is in charge of developing the code. Deprecated! Use TECHNICAL_OWNER instead.

NONE = 'NONE'

No specific type associated to the owner.

PRODUCER = 'PRODUCER'

A person, group, or service that produces/generates the data. Deprecated! Use TECHNICAL_OWNER instead.

STAKEHOLDER = 'STAKEHOLDER'

A person or a group that has direct business interest. Deprecated! Use TECHNICAL_OWNER, BUSINESS_OWNER, or STEWARD instead.

TECHNICAL_OWNER = 'TECHNICAL_OWNER'

A person or group who is responsible for technical aspects of the asset.

class datahub.metadata.schema_classes.OwnershipTypeInfoClass(name, created, lastModified, description=None)

Bases: _Aspect

Information about an ownership type

Parameters:
property created: AuditStampClass

Audit stamp capturing the time and actor who created the Ownership Type.

property description: None | str

Description of the Ownership Type

property lastModified: AuditStampClass

Audit stamp capturing the time and actor who last modified the Ownership Type.

property name: str

Display name of the Ownership Type

class datahub.metadata.schema_classes.OwnershipTypeKeyClass(id)

Bases: _Aspect

Key for an Ownership Type

Parameters:

id (str)

property id: str

Unique ID for the data ownership type name, e.g. Business Owner, Data Steward, Technical Owner, etc. Should be separate from the name used for displaying an Ownership Type.

class datahub.metadata.schema_classes.ParametersClass

Bases: DictWrapper

Arbitrary key-value parameters for an Entity Change Event. (any record).

class datahub.metadata.schema_classes.PartitionSpecClass(partition, type=None, timePartition=None)

Bases: DictWrapper

Defines how the data is partitioned

Parameters:
property partition: str

String representation of the partition

property timePartition: None | TimeWindowClass

Time window of the partition if applicable

property type: str | PartitionTypeClass
class datahub.metadata.schema_classes.PartitionTypeClass

Bases: object

FULL_TABLE = 'FULL_TABLE'
PARTITION = 'PARTITION'
QUERY = 'QUERY'
class datahub.metadata.schema_classes.PlatformEventClass(header, name, payload)

Bases: DictWrapper

A DataHub Platform Event.

Parameters:
property header: PlatformEventHeaderClass

Header information stored with the event.

property name: str

The name of the event, e.g. the type of event. For example, ‘notificationRequestEvent’, ‘entityChangeEvent’

property payload: GenericPayloadClass

The event payload.

class datahub.metadata.schema_classes.PlatformEventHeaderClass(timestampMillis)

Bases: DictWrapper

A header included with each DataHub platform event.

Parameters:

timestampMillis (int)

property timestampMillis: int

The event timestamp field as epoch at UTC in milliseconds.

class datahub.metadata.schema_classes.PlatformTypeClass

Bases: object

Platform types available at LinkedIn

FILE_SYSTEM = 'FILE_SYSTEM'

Value for a file system, e.g. hdfs

KEY_VALUE_STORE = 'KEY_VALUE_STORE'

Value for a key value store, e.g. espresso, voldemort

MESSAGE_BROKER = 'MESSAGE_BROKER'

Value for a message broker, e.g. kafka

OBJECT_STORE = 'OBJECT_STORE'

Value for an object store, e.g. ambry

OLAP_DATASTORE = 'OLAP_DATASTORE'

Value for an OLAP datastore, e.g. pinot

OTHERS = 'OTHERS'

Value for other platforms, e.g. salesforce, dovetail

QUERY_ENGINE = 'QUERY_ENGINE'

Value for a query engine, e.g. presto

RELATIONAL_DB = 'RELATIONAL_DB'

Value for a relational database, e.g. oracle, mysql

SEARCH_ENGINE = 'SEARCH_ENGINE'

Value for a search engine, e.g. seas

class datahub.metadata.schema_classes.PolicyMatchConditionClass

Bases: object

The matching condition in a filter criterion

EQUALS = 'EQUALS'
class datahub.metadata.schema_classes.PolicyMatchCriterionClass(field, values, condition=None)

Bases: DictWrapper

A criterion for matching a field with given value

Parameters:
property condition: str | PolicyMatchConditionClass

The condition for the criterion

property field: str

The name of the field that the criterion refers to

property values: List[str]

Values. Matches criterion if any one of the values matches condition (OR-relationship)

class datahub.metadata.schema_classes.PolicyMatchFilterClass(criteria)

Bases: DictWrapper

The filter for specifying the resource or actor to apply privileges to

Parameters:

criteria (List[PolicyMatchCriterionClass])

property criteria: List[PolicyMatchCriterionClass]

A list of criteria to apply conjunctively (so all criteria must pass)

class datahub.metadata.schema_classes.PostContentClass(title, type, description=None, link=None, media=None)

Bases: DictWrapper

Content stored inside a Post.

Parameters:
property description: None | str

Optional description of the post.

property link: None | str

Optional link that the post is associated with.

property media: None | MediaClass

Optional media that the post is storing

property title: str

Title of the post.

property type: str | PostContentTypeClass

Type of content held in the post.

class datahub.metadata.schema_classes.PostContentTypeClass

Bases: object

Enum defining the type of content held in a Post.

LINK = 'LINK'

Link content

TEXT = 'TEXT'

Text content

class datahub.metadata.schema_classes.PostInfoClass(type, content, created, lastModified)

Bases: _Aspect

Information about a DataHub Post.

Parameters:
property content: PostContentClass

Content stored in the post.

property created: int

The time at which the post was initially created

property lastModified: int

The time at which the post was last modified

property type: str | PostTypeClass

Type of the Post.

class datahub.metadata.schema_classes.PostKeyClass(id)

Bases: _Aspect

Key for a Post.

Parameters:

id (str)

property id: str

A unique id for the DataHub Post record. Generated on the server side at Post creation time.

class datahub.metadata.schema_classes.PostTypeClass

Bases: object

Enum defining types of Posts.

HOME_PAGE_ANNOUNCEMENT = 'HOME_PAGE_ANNOUNCEMENT'
class datahub.metadata.schema_classes.PrestoDDLClass(rawSchema)

Bases: DictWrapper

Schema holder for presto data definition language that describes a presto view.

Parameters:

rawSchema (str)

property rawSchema: str

The raw schema in the dataset’s platform. This includes the DDL and the columns extracted from DDL.

class datahub.metadata.schema_classes.QuantileClass(quantile, value)

Bases: DictWrapper

Parameters:
  • quantile (str)

  • value (str)

property quantile: str
property value: str
class datahub.metadata.schema_classes.QuantitativeAnalysesClass(unitaryResults=None, intersectionalResults=None)

Bases: _Aspect

Quantitative analyses should be disaggregated, that is, broken down by the chosen factors. Quantitative analyses should provide the results of evaluating the MLModel according to the chosen metrics, providing confidence interval values when possible.

Parameters:
  • unitaryResults (Optional[str])

  • intersectionalResults (Optional[str])

property intersectionalResults: None | str

Link to a dashboard with results showing how the MLModel performed with respect to the intersection of evaluated factors.

property unitaryResults: None | str

Link to a dashboard with results showing how the MLModel performed with respect to each factor

class datahub.metadata.schema_classes.QueryCellClass(cellId, changeAuditStamps, rawQuery, cellTitle=None, lastExecuted=None)

Bases: DictWrapper

Query cell in a Notebook, which will present content in query format

Parameters:
property cellId: str

Unique id for the cell. This id should be globally unique for a Notebook tool even when there are multiple deployments of it. As an example, Notebook URL could be used here for QueryBook such as ‘querybook.com/notebook/773/?cellId=1234’

property cellTitle: None | str

Title of the cell

property changeAuditStamps: ChangeAuditStampsClass

Captures information about who created/last modified/deleted this Notebook cell and when

property lastExecuted: None | AuditStampClass

Captures information about who last executed this query cell and when

property rawQuery: str

Raw query to explain some specific logic in a Notebook

class datahub.metadata.schema_classes.QueryKeyClass(id)

Bases: _Aspect

Key for a Query

Parameters:

id (str)

property id: str

A unique id for the Query.

class datahub.metadata.schema_classes.QueryLanguageClass

Bases: object

A SQL Query

SQL = 'SQL'
class datahub.metadata.schema_classes.QueryPropertiesClass(statement, source, created, lastModified, name=None, description=None)

Bases: _Aspect

Information about a Query against one or more data assets (e.g. Tables or Views).

Parameters:
property created: AuditStampClass

Audit stamp capturing the time and actor who created the Query.

property description: None | str

The Query description.

property lastModified: AuditStampClass

Audit stamp capturing the time and actor who last modified the Query.

property name: None | str

Optional display name to identify the query.

property source: str | QuerySourceClass

The source of the Query

property statement: QueryStatementClass

The Query Statement.

class datahub.metadata.schema_classes.QuerySourceClass

Bases: object

The query was entered manually by a user (via the UI).

MANUAL = 'MANUAL'
class datahub.metadata.schema_classes.QueryStatementClass(value, language=None)

Bases: DictWrapper

A query statement against one or more data assets.

Parameters:
property language: str | QueryLanguageClass

The language of the Query, e.g. SQL.

property value: str

The query text

class datahub.metadata.schema_classes.QuerySubjectClass(entity)

Bases: DictWrapper

A single subject of a particular query. In the future, we may evolve this model to include richer details about the Query Subject in relation to the query.

Parameters:

entity (str)

property entity: str

An entity which is the subject of a query.

class datahub.metadata.schema_classes.QuerySubjectsClass(subjects)

Bases: _Aspect

Information about the subjects of a particular Query, i.e. the assets being queried.

Parameters:

subjects (List[QuerySubjectClass])

property subjects: List[QuerySubjectClass]

One or more subjects of the query.

In single-asset queries (e.g. table select), this will contain the Table reference and optionally schema field references.

In multi-asset queries (e.g. table joins), this may contain multiple Table references and optionally schema field references.

class datahub.metadata.schema_classes.RecordTypeClass

Bases: DictWrapper

Record field type.

class datahub.metadata.schema_classes.RetentionClass(version=None, time=None)

Bases: DictWrapper

Base class that encapsulates different retention policies. Only one of the fields should be set

Parameters:
property time: None | TimeBasedRetentionClass
property version: None | VersionBasedRetentionClass
class datahub.metadata.schema_classes.RoleAssociationClass(urn)

Bases: DictWrapper

Properties of an applied Role. For now, just an Urn

Parameters:

urn (str)

property urn: str

Urn of the External Role

class datahub.metadata.schema_classes.RoleKeyClass(id)

Bases: _Aspect

Key for an External AccessManagement

Parameters:

id (str)

property id: str

A unique id for the access management IAM.

class datahub.metadata.schema_classes.RoleMembershipClass(roles)

Bases: _Aspect

Carries information about which roles a user is assigned to.

Parameters:

roles (List[str])

property roles: List[str]
class datahub.metadata.schema_classes.RolePropertiesClass(name, type, description=None, requestUrl=None, created=None)

Bases: _Aspect

Information about an ExternalRoleProperties

Parameters:
  • name (str)

  • type (str)

  • description (Optional[str])

  • requestUrl (Optional[str])

  • created (Optional[AuditStampClass])

property created: None | AuditStampClass

Created Audit stamp

property description: None | str

Description of the IAM Role

property name: str

Display name of the IAM Role in the external system

property requestUrl: None | str

Link to access external access management

property type: str

Can be READ, ADMIN, WRITE

class datahub.metadata.schema_classes.RoleUserClass(user)

Bases: DictWrapper

Provisioned users of a role

Parameters:

user (str)

property user: str

Link provisioned corp user for a role

class datahub.metadata.schema_classes.RunResultTypeClass

Bases: object

FAILURE = 'FAILURE'

The Run Failed

SKIPPED = 'SKIPPED'

The Run Skipped

SUCCESS = 'SUCCESS'

The Run Succeeded

UP_FOR_RETRY = 'UP_FOR_RETRY'

The Run Failed and will Retry

class datahub.metadata.schema_classes.SchemaFieldClass(fieldPath, type, nativeDataType, jsonPath=None, nullable=None, description=None, label=None, created=None, lastModified=None, recursive=None, globalTags=None, glossaryTerms=None, isPartOfKey=None, isPartitioningKey=None, jsonProps=None)

Bases: DictWrapper

SchemaField to describe metadata related to dataset schema.

Parameters:
  • fieldPath (str)

  • type (SchemaFieldDataTypeClass)

  • nativeDataType (str)

  • jsonPath (Optional[str])

  • nullable (Optional[bool])

  • description (Optional[str])

  • label (Optional[str])

  • created (Optional[AuditStampClass])

  • lastModified (Optional[AuditStampClass])

  • recursive (Optional[bool])

  • globalTags (Optional[GlobalTagsClass])

  • glossaryTerms (Optional[GlossaryTermsClass])

  • isPartOfKey (Optional[bool])

  • isPartitioningKey (Optional[bool])

  • jsonProps (Optional[str])

property created: None | AuditStampClass

An AuditStamp corresponding to the creation of this schema field.

property description: None | str

Description

property fieldPath: str

Flattened name of the field. Field is computed from jsonPath field.

property globalTags: None | GlobalTagsClass

Tags associated with the field

property glossaryTerms: None | GlossaryTermsClass

Glossary terms associated with the field

property isPartOfKey: bool

For schema fields that are part of complex keys, set this field to true. We do this to easily distinguish between value and key fields.

property isPartitioningKey: None | bool

For Datasets which are partitioned, this determines the partitioning key.

property jsonPath: None | str

Flattened name of a field in JSON Path notation.

property jsonProps: None | str

For schema fields that have other properties that are not modeled explicitly, use this field to serialize those properties into a JSON string

property label: None | str

Label of the field. Provides a more human-readable name for the field than field path. Some sources will provide this metadata but not all sources have the concept of a label. If just one string is associated with a field in a source, that is most likely a description.

property lastModified: None | AuditStampClass

An AuditStamp corresponding to the last modification of this schema field.

property nativeDataType: str

The native type of the field in the dataset’s platform as declared by platform schema.

property nullable: bool

Indicates if this field is optional or nullable

property recursive: bool

There are use cases when a field in type B references type A. A field in A references field of type B. In such cases, we will mark the first field as recursive.

property type: SchemaFieldDataTypeClass

Platform independent field type of the field.

class datahub.metadata.schema_classes.SchemaFieldDataTypeClass(type)

Bases: DictWrapper

Schema field data types

Parameters:

type (Union[BooleanTypeClass, FixedTypeClass, StringTypeClass, BytesTypeClass, NumberTypeClass, DateTypeClass, TimeTypeClass, EnumTypeClass, NullTypeClass, MapTypeClass, ArrayTypeClass, UnionTypeClass, RecordTypeClass])

property type: BooleanTypeClass | FixedTypeClass | StringTypeClass | BytesTypeClass | NumberTypeClass | DateTypeClass | TimeTypeClass | EnumTypeClass | NullTypeClass | MapTypeClass | ArrayTypeClass | UnionTypeClass | RecordTypeClass

Data platform specific types

class datahub.metadata.schema_classes.SchemaFieldKeyClass(parent, fieldPath)

Bases: _Aspect

Key for a SchemaField

Parameters:
  • parent (str)

  • fieldPath (str)

property fieldPath: str

fieldPath identifying the schema field

property parent: str

Parent associated with the schema field

class datahub.metadata.schema_classes.SchemaFieldSnapshotClass(urn, aspects)

Bases: DictWrapper

A metadata snapshot for a specific schema field entity.

Parameters:
property aspects: List[SchemaFieldKeyClass]

The list of metadata aspects associated with the schema field. Depending on the use case, this can either be all, or a selection, of supported aspects.

property urn: str

URN for the entity the metadata snapshot is associated with.

class datahub.metadata.schema_classes.SchemaMetadataClass(schemaName, platform, version, hash, platformSchema, fields, created=None, lastModified=None, deleted=None, dataset=None, cluster=None, primaryKeys=None, foreignKeysSpecs=None, foreignKeys=None)

Bases: _Aspect

SchemaMetadata to describe metadata related to store schema

Parameters:
property cluster: None | str

The cluster this schema metadata resides in

property created: AuditStampClass

An AuditStamp corresponding to the creation of this resource/association/sub-resource. A value of 0 for time indicates missing data.

property dataset: None | str

Dataset this schema metadata is associated with.

property deleted: None | AuditStampClass

An AuditStamp corresponding to the deletion of this resource/association/sub-resource. Logically, deleted MUST have a later timestamp than creation. It may or may not have the same time as lastModified depending upon the resource/association/sub-resource semantics.

property fields: List[SchemaFieldClass]

Client provided a list of fields from document schema.

property foreignKeys: None | List[ForeignKeyConstraintClass]

List of foreign key constraints for the schema

property foreignKeysSpecs: None | Dict[str, ForeignKeySpecClass]

Map captures all the references schema makes to external datasets. Map key is ForeignKeySpecName typeref.

property hash: str

the SHA1 hash of the schema content

property lastModified: AuditStampClass

An AuditStamp corresponding to the last modification of this resource/association/sub-resource. If no modification has happened since creation, lastModified should be the same as created. A value of 0 for time indicates missing data.

property platform: str

Standardized platform urn where schema is defined. The data platform Urn (urn:li:platform:{platform_name})

property platformSchema: EspressoSchemaClass | OracleDDLClass | MySqlDDLClass | PrestoDDLClass | KafkaSchemaClass | BinaryJsonSchemaClass | OrcSchemaClass | SchemalessClass | KeyValueSchemaClass | OtherSchemaClass

The native schema in the dataset’s platform.

property primaryKeys: None | List[str]

Client provided list of fields that define primary keys to access record. Field order defines hierarchical espresso keys. An empty list indicates the absence of a primary key access pattern. Value is a SchemaField@fieldPath.

property schemaName: str

Schema name e.g. PageViewEvent, identity.Profile, ams.account_management_tracking

property version: int

Every change to SchemaMetadata in the resource results in a new version. Version is server assigned. This version differs from the platform's native schema version.

class datahub.metadata.schema_classes.SchemalessClass

Bases: DictWrapper

The dataset has no specific schema associated with it

class datahub.metadata.schema_classes.SiblingsClass(siblings, primary)

Bases: _Aspect

Siblings information of an entity.

Parameters:
  • siblings (List[str])

  • primary (bool)

property primary: bool

If this is the leader entity of the set of siblings

property siblings: List[str]

List of sibling entities

class datahub.metadata.schema_classes.SourceCodeClass(sourceCode)

Bases: _Aspect

Source Code

Parameters:

sourceCode (List[SourceCodeUrlClass])

property sourceCode: List[SourceCodeUrlClass]

Source Code along with types

class datahub.metadata.schema_classes.SourceCodeUrlClass(type, sourceCodeUrl)

Bases: DictWrapper

Source Code Url Entity

Parameters:
property sourceCodeUrl: str

Source Code Url

property type: str | SourceCodeUrlTypeClass

Source Code Url Types

class datahub.metadata.schema_classes.SourceCodeUrlTypeClass

Bases: object

EVALUATION_PIPELINE_SOURCE_CODE = 'EVALUATION_PIPELINE_SOURCE_CODE'
ML_MODEL_SOURCE_CODE = 'ML_MODEL_SOURCE_CODE'
TRAINING_PIPELINE_SOURCE_CODE = 'TRAINING_PIPELINE_SOURCE_CODE'
class datahub.metadata.schema_classes.StatusClass(removed=None)

Bases: _Aspect

The lifecycle status metadata of an entity, e.g. dataset, metric, feature, etc. This aspect is used to represent soft deletes conventionally.

Parameters:

removed (Optional[bool])

property removed: bool

Whether the entity has been removed (soft-deleted).

class datahub.metadata.schema_classes.StringTypeClass

Bases: DictWrapper

String field type.

class datahub.metadata.schema_classes.StructuredExecutionReportClass(type, serializedValue, contentType)

Bases: DictWrapper

A flexible carrier for structured results of an execution request. The goal is to allow for free flow of structured responses from execution tasks to the orchestrator or observer. The full spectrum of different execution report types is not intended to be modeled by this object.

Parameters:
  • type (str)

  • serializedValue (str)

  • contentType (str)

property contentType: str

The content-type of the serialized value (e.g. application/json, application/json;gzip etc.)

property serializedValue: str

The serialized value of the structured report

property type: str

The type of the structured report. (e.g. INGESTION_REPORT, TEST_CONNECTION_REPORT, etc.)

class datahub.metadata.schema_classes.SubTypesClass(typeNames)

Bases: _Aspect

Sub Types. Use this aspect to specialize a generic Entity e.g. Making a Dataset also be a View or also be a LookerExplore

Parameters:

typeNames (List[str])

property typeNames: List[str]

The names of the specific types.

class datahub.metadata.schema_classes.SystemMetadataClass(lastObserved=None, runId=None, pipelineName=None, registryName=None, registryVersion=None, properties=None)

Bases: DictWrapper

Metadata associated with each metadata change that is processed by the system

Parameters:
  • lastObserved (Optional[int])

  • runId (Optional[str])

  • pipelineName (Optional[str])

  • registryName (Optional[str])

  • registryVersion (Optional[str])

  • properties (Optional[Dict[str, str]])

property lastObserved: int | None

The timestamp the metadata was observed at

property pipelineName: None | str

The ingestion pipeline id that produced the metadata. Populated in case of batch ingestion.

property properties: None | Dict[str, str]

Additional properties

property registryName: None | str

The model registry name that was used to process this event

property registryVersion: None | str

The model registry version that was used to process this event

property runId: str | None

The run id that produced the metadata. Populated in case of batch-ingestion.

class datahub.metadata.schema_classes.TagAssociationClass(tag, context=None)

Bases: DictWrapper

Properties of an applied tag. For now, just an Urn. In the future we can extend this with other properties, e.g. propagation parameters.

Parameters:
  • tag (str)

  • context (Optional[str])

property context: None | str

Additional context about the association

property tag: str

Urn of the applied tag

class datahub.metadata.schema_classes.TagKeyClass(name)

Bases: _Aspect

Key for a Tag

Parameters:

name (str)

property name: str

The tag name, which serves as a unique id

class datahub.metadata.schema_classes.TagPropertiesClass(name, description=None, colorHex=None)

Bases: _Aspect

Properties associated with a Tag

Parameters:
  • name (str)

  • description (Optional[str])

  • colorHex (Optional[str])

property colorHex: None | str

The color associated with the Tag in Hex. For example #FFFFFF.

property description: None | str

Documentation of the tag

property name: str

Display name of the tag

class datahub.metadata.schema_classes.TagSnapshotClass(urn, aspects)

Bases: DictWrapper

A metadata snapshot for a specific tag entity.

Parameters:
property aspects: List[TagKeyClass | OwnershipClass | TagPropertiesClass | StatusClass]

The list of metadata aspects associated with the tag. Depending on the use case, this can either be all, or a selection, of supported aspects.

property urn: str

URN for the entity the metadata snapshot is associated with.

class datahub.metadata.schema_classes.TelemetryClientIdClass(clientId)

Bases: _Aspect

A simple wrapper around a String to persist the client ID for telemetry in DataHub’s backend DB

Parameters:

clientId (str)

property clientId: str

A string representing the telemetry client ID

class datahub.metadata.schema_classes.TelemetryKeyClass(name)

Bases: _Aspect

Key for the telemetry client ID, only one should ever exist

Parameters:

name (str)

property name: str

The telemetry entity name, which serves as a unique id

class datahub.metadata.schema_classes.TestDefinitionClass(type, json=None)

Bases: DictWrapper

Parameters:
property json: None | str

JSON format configuration for the test

property type: str | TestDefinitionTypeClass

The Test Definition Type

class datahub.metadata.schema_classes.TestDefinitionTypeClass

Bases: object

JSON / YAML test def

JSON = 'JSON'
class datahub.metadata.schema_classes.TestInfoClass(name, category, definition, description=None)

Bases: _Aspect

Information about a DataHub Test

Parameters:
property category: str

Category of the test

property definition: TestDefinitionClass

Configuration for the Test

property description: None | str

Description of the test

property name: str

The name of the test

class datahub.metadata.schema_classes.TestKeyClass(id)

Bases: _Aspect

Key for a Test

Parameters:

id (str)

property id: str

Unique id for the test

class datahub.metadata.schema_classes.TestResultClass(test, type)

Bases: DictWrapper

Information about a Test Result

Parameters:
property test: str

The urn of the test

property type: str | TestResultTypeClass

The type of the result

class datahub.metadata.schema_classes.TestResultTypeClass

Bases: object

FAILURE = 'FAILURE'

The Test Failed

SUCCESS = 'SUCCESS'

The Test Succeeded

class datahub.metadata.schema_classes.TestResultsClass(failing, passing)

Bases: _Aspect

Information about a Test Result

Parameters:
property failing: List[TestResultClass]

Results that are failing

property passing: List[TestResultClass]

Results that are passing

class datahub.metadata.schema_classes.TextCellClass(cellId, changeAuditStamps, text, cellTitle=None)

Bases: DictWrapper

Text cell in a Notebook, which will present content in text format

Parameters:
property cellId: str

Unique id for the cell. This id should be globally unique for a Notebook tool even when there are multiple deployments of it. As an example, Notebook URL could be used here for QueryBook such as ‘querybook.com/notebook/773/?cellId=1234’

property cellTitle: None | str

Title of the cell

property changeAuditStamps: ChangeAuditStampsClass

Captures information about who created/last modified/deleted this Notebook cell and when

property text: str

The actual text in a TextCell in a Notebook

class datahub.metadata.schema_classes.TimeBasedRetentionClass(maxAgeInSeconds)

Bases: DictWrapper

Keep records that are less than X seconds old

Parameters:

maxAgeInSeconds (int)

property maxAgeInSeconds: int
class datahub.metadata.schema_classes.TimeStampClass(time, actor=None)

Bases: DictWrapper

A standard event timestamp

Parameters:
  • time (int)

  • actor (Optional[str])

property actor: None | str

Optional: The actor urn involved in the event.

property time: int

When did the event occur

class datahub.metadata.schema_classes.TimeTypeClass

Bases: DictWrapper

Time field type. This should also be used for datetimes.

class datahub.metadata.schema_classes.TimeWindowClass(startTimeMillis, length)

Bases: DictWrapper

Parameters:
property length: TimeWindowSizeClass

The length of the window.

property startTimeMillis: int

Start time as epoch at UTC.

class datahub.metadata.schema_classes.TimeWindowSizeClass(unit, multiple=None)

Bases: DictWrapper

Defines the size of a time window.

Parameters:
property multiple: int

How many units. Defaults to 1.

property unit: str | CalendarIntervalClass

Interval unit such as minute/hour/day etc.

class datahub.metadata.schema_classes.TrainingDataClass(trainingData)

Bases: _Aspect

Ideally, the MLModel card would contain as much information about the training data as the evaluation data. However, there might be cases where it is not feasible to provide this level of detailed information about the training data. For example, the data may be proprietary, or require a non-disclosure agreement. In these cases, we advocate for basic details about the distributions over groups in the data, as well as any other details that could inform stakeholders on the kinds of biases the model may have encoded.

Parameters:

trainingData (List[BaseDataClass])

property trainingData: List[BaseDataClass]

Details on the dataset(s) used for training the MLModel

class datahub.metadata.schema_classes.TransformationTypeClass

Bases: object

Type of the transformation involved in generating destination fields from source fields.

BLACKBOX = 'BLACKBOX'

Field transformation expressed as Identity function. (This description belongs to IDENTITY, listed below.)

IDENTITY = 'IDENTITY'
class datahub.metadata.schema_classes.UDFTransformerClass(udf)

Bases: DictWrapper

Field transformation expressed in UDF

Parameters:

udf (str)

property udf: str

A UDF describing how the source fields were transformed into the destination field. This is the FQCN (Fully Qualified Class Name) of the UDF.

class datahub.metadata.schema_classes.UnionTypeClass(nestedTypes=None)

Bases: DictWrapper

Union field type.

Parameters:

nestedTypes (Optional[List[str]])

property nestedTypes: None | List[str]

List of types in union type.

class datahub.metadata.schema_classes.UpstreamClass(dataset, type, auditStamp=None, created=None, properties=None)

Bases: DictWrapper

Upstream lineage information about a dataset including the source reporting the lineage

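Parameters:
  • dataset (str)

  • type (Union[str, DatasetLineageTypeClass])

  • auditStamp (Optional[AuditStampClass])

  • created (Optional[AuditStampClass])

  • properties (Optional[Dict[str, str]])

property auditStamp: AuditStampClass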

Audit stamp containing who reported the lineage and when.

property created: None | AuditStampClass

Audit stamp containing who created the lineage and when.

property dataset: str

The upstream dataset the lineage points to

property properties: None | Dict[str, str]

A generic properties bag that allows us to store specific information on this graph edge.

property type: str | DatasetLineageTypeClass

The type of the lineage

class datahub.metadata.schema_classes.UpstreamLineageClass(upstreams, fineGrainedLineages=None)

Bases: _Aspect

Upstream lineage of a dataset

Parameters:
  • upstreams (List[UpstreamClass])

  • fineGrainedLineages (Optional[List[FineGrainedLineageClass]])

property fineGrainedLineages: None | List[FineGrainedLineageClass]

List of fine-grained lineage information, including field-level lineage

property upstreams: List[UpstreamClass]

List of upstream dataset lineage information

class datahub.metadata.schema_classes.UrnForeignKeyClass(currentFieldPath)

Bases: DictWrapper

If SchemaMetadata fields make any external references and those references are of type com.linkedin.pegasus2avro.common.Urn or any of its children, this model can be used to mark them.

Parameters:

currentFieldPath (str)

property currentFieldPath: str

Field in the hosting (current) SchemaMetadata.

class datahub.metadata.schema_classes.UsageAggregationClass(bucket, duration, resource, metrics)

Bases: DictWrapper

Usage data for a given resource, rolled up into a bucket.

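Parameters:
  • bucket (int)

  • duration (Union[str, WindowDurationClass])

  • resource (str)

  • metrics (UsageAggregationMetricsClass)

property bucket: int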

Bucket start time in milliseconds

property duration: str | WindowDurationClass

Bucket duration

property metrics: UsageAggregationMetricsClass

Metrics associated with this bucket

property resource: str

Resource associated with these usage stats

class datahub.metadata.schema_classes.UsageAggregationMetricsClass(uniqueUserCount=None, users=None, totalSqlQueries=None, topSqlQueries=None, fields=None)

Bases: DictWrapper

Metrics for usage data for a given resource and bucket. Not all fields make sense for all buckets, so every field is optional.

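Parameters:
  • uniqueUserCount (Optional[int])

  • users (Optional[List[UserUsageCountsClass]])

  • totalSqlQueries (Optional[int])

  • topSqlQueries (Optional[List[str]])

  • fields (Optional[List[FieldUsageCountsClass]])

property fields: None | List[FieldUsageCountsClass]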

Field-level usage stats

property topSqlQueries: None | List[str]

Frequent SQL queries; mostly makes sense for datasets in SQL databases

property totalSqlQueries: None | int

Total SQL query count

property uniqueUserCount: None | int

Unique user count

property users: None | List[UserUsageCountsClass]

Users within this bucket, with frequency counts

class datahub.metadata.schema_classes.UserUsageCountsClass(count, user=None, userEmail=None)

Bases: DictWrapper

Records a single user’s usage counts for a given resource

Parameters:
  • count (int)

  • user (Optional[str])

  • userEmail (Optional[str])

property count: int
property user: None | str
property userEmail: None | str

If userEmail is set, we attempt to resolve the user’s urn upon ingest

class datahub.metadata.schema_classes.ValueFrequencyClass(value, frequency)

Bases: DictWrapper

Parameters:
  • value (str)

  • frequency (int)

property frequency: int
property value: str
class datahub.metadata.schema_classes.VersionBasedRetentionClass(maxVersions)

Bases: DictWrapper

Keep max N latest records

Parameters:

maxVersions (int)

property maxVersions: int
class datahub.metadata.schema_classes.VersionInfoClass(version, versionType, customProperties=None, externalUrl=None)

Bases: _Aspect

Information about a Data processing job

Parameters:
  • version (str)

  • versionType (str)

  • customProperties (Optional[Dict[str, str]])

  • externalUrl (Optional[str])

property customProperties: Dict[str, str]

Custom property bag.

property externalUrl: None | str

URL where the reference exists

property version: str

The version which can identify a job version, like a commit hash or md5 hash

property versionType: str

The type of the version like git hash or md5 hash

class datahub.metadata.schema_classes.VersionTagClass(versionTag=None)

Bases: DictWrapper

A resource-defined string representing the resource state for the purpose of concurrency control

Parameters:

versionTag (Optional[str])

property versionTag: None | str
class datahub.metadata.schema_classes.ViewPropertiesClass(materialized, viewLogic, viewLanguage)

Bases: _Aspect

Details about a View. e.g. Gets activated when subTypes is view

Parameters:
  • materialized (bool)

  • viewLogic (str)

  • viewLanguage (str)

property materialized: bool

Whether the view is materialized

property viewLanguage: str

The view logic language / dialect

property viewLogic: str

The view logic

class datahub.metadata.schema_classes.WindowDurationClass

Bases: object

Enum to define the length of a bucket when doing aggregations

DAY = 'DAY'
HOUR = 'HOUR'
MONTH = 'MONTH'
WEEK = 'WEEK'
YEAR = 'YEAR'
datahub.metadata.schema_classes.get_schema_type(fullname)
Parameters:

fullname (str)

Return type:

RecordSchema