{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "https://doc.dataiku.com/schemas/dataiku/dataset.json",
  "title": "Dataiku DSS Dataset",
  "description": "A dataset in Dataiku DSS representing a structured data source or output, including its schema, connection parameters, format configuration, and flow integration settings.",
  "type": "object",
  "required": ["name", "type"],
  "properties": {
    "projectKey": {
      "type": "string",
      "description": "Project key that this dataset belongs to"
    },
    "name": {
      "type": "string",
      "description": "Dataset name, unique within the project",
      "minLength": 1,
      "maxLength": 256
    },
    "type": {
      "type": "string",
      "enum": [
        "Filesystem",
        "UploadedFiles",
        "PostgreSQL",
        "MySQL",
        "Oracle",
        "SQLServer",
        "Redshift",
        "BigQuery",
        "Snowflake",
        "Synapse",
        "Teradata",
        "S3",
        "GCS",
        "Azure",
        "HDFS",
        "Hive",
        "MongoDB",
        "Elasticsearch",
        "Cassandra",
        "HTTP",
        "FTP",
        "SCP",
        "Twitter",
        "Inline",
        "StatsDB",
        "JobsDB",
        "JDBC"
      ],
      "description": "Type of the dataset, determining the underlying storage or connection"
    },
    "managed": {
      "type": "boolean",
      "description": "Whether the dataset is managed by DSS (output of a recipe) or external"
    },
    "schema": {
      "$ref": "#/$defs/Schema"
    },
    "formatType": {
      "type": "string",
      "enum": ["csv", "parquet", "json", "avro", "orc", "excel", "xml"],
      "description": "Data format for file-based datasets"
    },
    "formatParams": {
      "type": "object",
      "description": "Format-specific parameters",
      "properties": {
        "separator": {
          "type": "string",
          "description": "Column separator for CSV files"
        },
        "style": {
          "type": "string",
          "description": "CSV style (e.g., excel, unix, escaped)"
        },
        "quoteChar": {
          "type": "string",
          "description": "Quote character for CSV"
        },
        "escapeChar": {
          "type": "string",
          "description": "Escape character for CSV"
        },
        "parseHeaderRow": {
          "type": "boolean",
          "description": "Whether to parse the first row as header"
        },
        "charset": {
          "type": "string",
          "description": "Character encoding (e.g., utf8, latin1)"
        },
        "compress": {
          "type": "string",
          "description": "Compression type (e.g., gz, bz2, snappy)"
        }
      }
    },
    "params": {
      "type": "object",
      "description": "Type-specific connection and access parameters",
      "properties": {
        "connection": {
          "type": "string",
          "description": "DSS connection name for database-backed datasets"
        },
        "table": {
          "type": "string",
          "description": "Database table name"
        },
        "schema": {
          "type": "string",
          "description": "Database schema name"
        },
        "catalog": {
          "type": "string",
          "description": "Database catalog name"
        },
        "path": {
          "type": "string",
          "description": "File path for file-based datasets"
        },
        "bucket": {
          "type": "string",
          "description": "Cloud storage bucket name"
        },
        "notReadyIfEmpty": {
          "type": "boolean",
          "description": "Consider dataset not ready if it contains no data"
        }
      }
    },
    "partitioning": {
      "$ref": "#/$defs/Partitioning"
    },
    "flowOptions": {
      "type": "object",
      "description": "Options for how this dataset behaves in the flow",
      "properties": {
        "virtualizable": {
          "type": "boolean",
          "description": "Whether the dataset can be virtualized"
        },
        "rebuildBehavior": {
          "type": "string",
          "enum": ["NORMAL", "WRITE_PROTECTED", "NO_REBUILD"],
          "description": "Rebuild behavior for the dataset"
        },
        "crossProjectBuildBehavior": {
          "type": "string",
          "enum": ["DEFAULT", "BUILD", "NO_BUILD"],
          "description": "Build behavior when accessed from another project"
        }
      }
    },
    "metrics": {
      "$ref": "#/$defs/MetricsSettings"
    },
    "checks": {
      "$ref": "#/$defs/ChecksSettings"
    },
    "creationTag": {
      "$ref": "#/$defs/VersionTag"
    },
    "versionTag": {
      "$ref": "#/$defs/VersionTag"
    }
  },
  "$defs": {
    "Schema": {
      "type": "object",
      "description": "Dataset schema defining columns and their types",
      "properties": {
        "columns": {
          "type": "array",
          "items": {
            "$ref": "#/$defs/Column"
          },
          "description": "Ordered list of columns"
        },
        "userModified": {
          "type": "boolean",
          "description": "Whether the schema was manually modified by a user"
        }
      }
    },
    "Column": {
      "type": "object",
      "description": "A column in a dataset schema",
      "required": ["name", "type"],
      "properties": {
        "name": {
          "type": "string",
          "description": "Column name"
        },
        "type": {
          "type": "string",
          "enum": [
            "string",
            "bigint",
            "int",
            "smallint",
            "tinyint",
            "double",
            "float",
            "boolean",
            "date",
            "array",
            "map",
            "object",
            "geopoint",
            "geometry"
          ],
          "description": "Column data type"
        },
        "meaning": {
          "type": "string",
          "description": "Semantic meaning assigned to this column (e.g., Email, URL, IPAddress)"
        },
        "maxLength": {
          "type": "integer",
          "minimum": -1,
          "description": "Maximum length for string columns (-1 for unlimited)"
        },
        "comment": {
          "type": "string",
          "description": "Documentation comment for the column"
        }
      }
    },
    "Partitioning": {
      "type": "object",
      "description": "Partitioning configuration for the dataset",
      "properties": {
        "dimensions": {
          "type": "array",
          "items": {
            "type": "object",
            "description": "A single partition dimension",
            "properties": {
              "name": {
                "type": "string",
                "description": "Partition dimension name"
              },
              "type": {
                "type": "string",
                "enum": ["value", "time"],
                "description": "Partition dimension type"
              },
              "params": {
                "type": "object",
                "description": "Dimension-specific parameters",
                "properties": {
                  "period": {
                    "type": "string",
                    "enum": ["YEAR", "MONTH", "DAY", "HOUR"],
                    "description": "Time period for time-based partitions"
                  }
                }
              }
            }
          },
          "description": "Partition dimensions"
        },
        "filePathPattern": {
          "type": "string",
          "description": "File path pattern for file-based partitioned datasets"
        }
      }
    },
    "MetricsSettings": {
      "type": "object",
      "description": "Metrics computation settings",
      "properties": {
        "probes": {
          "type": "array",
          "items": {
            "type": "object",
            "description": "A single metric probe",
            "properties": {
              "type": {
                "type": "string",
                "description": "Metric probe type"
              },
              "enabled": {
                "type": "boolean",
                "description": "Whether this probe is enabled"
              },
              "computeOnBuildMode": {
                "type": "string",
                "enum": ["NO", "PARTITION", "WHOLE_DATASET"],
                "description": "When to compute the metric during builds"
              }
            }
          },
          "description": "List of metric probes"
        },
        "displayedState": {
          "type": "object",
          "description": "Display state for metrics"
        }
      }
    },
    "ChecksSettings": {
      "type": "object",
      "description": "Data quality check settings",
      "properties": {
        "checks": {
          "type": "array",
          "items": {
            "type": "object",
            "description": "A single data quality check",
            "properties": {
              "type": {
                "type": "string",
                "description": "Check type"
              },
              "name": {
                "type": "string",
                "description": "Check name"
              },
              "meta": {
                "type": "object",
                "description": "Check metadata"
              },
              "params": {
                "type": "object",
                "description": "Check parameters"
              }
            }
          },
          "description": "List of data quality checks"
        }
      }
    },
    "VersionTag": {
      "type": "object",
      "description": "Version tracking information",
      "properties": {
        "versionNumber": {
          "type": "integer",
          "minimum": 0,
          "description": "Sequential version number"
        },
        "lastModifiedBy": {
          "type": "object",
          "description": "User who made the last modification",
          "properties": {
            "login": {
              "type": "string",
              "description": "Login of the user who made the modification"
            }
          }
        },
        "lastModifiedOn": {
          "type": "string",
          "format": "date-time",
          "description": "Timestamp of the last modification"
        }
      }
    }
  }
}