{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "type": "object",
  "title": "Metadata",
  "description": "MoJ Data Catalogue Metadata",
  "required": ["columns"],
  "properties": {
    "columns": {
      "type": "array",
      "title": "The columns in the table. An array of objects",
      "items": {
        "type": "object",
        "required": ["name", "type"],
        "properties": {
          "description": {
            "type": "string",
            "title": "A description of this field"
          },
          "name": {
            "type": "string",
            "title": "The column name. Should be lower case with underscores, not spaces",
            "$comment": "The pattern is anchored with ^ and $ because JSON Schema patterns are not implicitly anchored; an unanchored pattern would accept any name that merely contains one matching character.",
            "pattern": "^[a-z0-9_]+$",
            "examples": ["employee_number"]
          },
          "type": {
            "type": "string",
            "title": "The data type. We use a limited set of data types for cross compatibility between Spark, R, Pandas etc. See lookup here: https://github.com/moj-analytical-services/dataengineeringutils/blob/master/dataengineeringutils/data/data_type_conversion.csv",
            "enum": [
              "character",
              "int",
              "long",
              "float",
              "double",
              "decimal",
              "date",
              "datetime",
              "binary",
              "boolean",
              "struct",
              "array"
            ]
          },
          "pattern": {
            "type": "string",
            "title": "regex pattern that can be used to validate data in this column"
          },
          "enum": {
            "type": "array",
            "title": "An array of valid values that can exist in this column. Note NULL/None is not required, please use nullable property to define if column is nullable.",
            "examples": [
              ["Y", "N"],
              [0, 1, 2, 3, 4],
              ["England", "Northern Ireland", "Scotland", "Wales"]
            ]
          },
          "nullable": {
            "type": "boolean",
            "title": "Specifies if column is nullable (can have missing values) or not (cannot have missing values)"
          },
          "sensitivity": {
            "type": "string",
            "title": "Specifies if the column contains personal data or special category data",
            "enum": ["personal_data", "special_category_data"]
          },
          "redacted": {
            "type": "boolean",
            "title": "Specifies if the column is redacted or removed in non-sensitive versions of the table"
          }
        }
      }
    },
    "description": {
      "type": "string",
      "title": "A description of what this table contains"
    },
    "name": {
      "type": "string",
      "title": "The name of the table in the database"
    },
    "data_format": {
      "type": "string",
      "title": "The format of the data in s3, and instruction on how to parse, see here https://github.com/moj-analytical-services/dataengineeringutils/blob/ae295caf93c75c80510abf0c74865939c94d3e70/dataengineeringutils/glue.py#L45",
      "enum": [
        "avro",
        "csv",
        "csv_quoted_nodate",
        "regex",
        "orc",
        "par",
        "parquet",
        "json"
      ]
    },
    "location": {
      "type": "string",
      "title": "The path to the data in s3. Usually, you should use path relative to the database root directory, unless the database contains tables spread across multiple buckets or directories",
      "examples": ["sop_full/"]
    },
    "partitions": {
      "type": ["null", "array"],
      "title": "Columns used to partition the table"
    },
    "primary_key": {
      "type": ["null", "array"],
      "title": "Columns that form the primary key of the table"
    },
    "glue_specific": {
      "type": ["null", "object"],
      "title": "Dict used to add any additional table properties for glue catalogue. For an example see here: https://github.com/moj-analytical-services/etl_manager/blob/master/example/meta_data/db1/pay.json#L19"
    },
    "sensitivity": {
      "type": "array",
      "title": "Specifies which types of sensitive data, if any, are contained in the table",
      "examples": [
        ["personal_data"],
        ["special_category_data"],
        ["personal_data", "special_category_data"]
      ]
    }
  }
}