{ "type": "object", "patternProperties": { "^[a-z0-9-_]+$": { "required": ["type"], "properties": { "type": { "type": "string", "enum": [ "PartitionedDataSet", "CachedDataSet", "MemoryDataSet", "LambdaDataSet", "networkx.NetworkXDataSet", "dask.ParquetDataSet", "geopandas.GeoJSONDataSet", "pillow.ImageDataSet", "json.JSONDataSet", "biosequence.BioSequenceDataSet", "tensorflow.TensorFlowModelDataset", "api.APIDataSet", "matplotlib.MatplotlibWriter", "yaml.YAMLDataSet", "pickle.PickleDataSet", "text.TextDataSet", "holoviews.HoloviewsWriter", "email.EmailMessageDataSet", "spark.SparkJDBCDataSet", "spark.SparkHiveDataSet", "spark.SparkDataSet", "pandas.AppendableExcelDataSet", "pandas.JSONDataSet", "pandas.SQLTableDataSet", "pandas.SQLQueryDataSet", "pandas.ParquetDataSet", "pandas.FeatherDataSet", "pandas.HDFDataSet", "pandas.CSVDataSet", "pandas.ExcelDataSet", "pandas.GBQTableDataSet", "pandas.GBQQueryDataSet", "pandas.GenericDataSet" ] } }, "allOf": [ { "if": { "properties": { "type": { "const": "PartitionedDataSet" } } }, "then": { "required": ["path", "dataset"], "properties": { "path": { "type": "string", "description": "Path to the folder containing partitioned data.\nIf path starts with the protocol (e.g., ``s3://``) then the\ncorresponding ``fsspec`` concrete filesystem implementation will\nbe used. If protocol is not specified,\n``fsspec.implementations.local.LocalFileSystem`` will be used.\n**Note:** Some concrete implementations are bundled with ``fsspec``,\nwhile others (like ``s3`` or ``gcs``) must be installed separately\nprior to usage of the ``PartitionedDataSet``." }, "dataset": { "pattern": ".*", "description": "Underlying dataset definition. 
This is used to instantiate\nthe dataset for each file located inside the ``path``.\nAccepted formats are:\na) object of a class that inherits from ``AbstractDataSet``\nb) a string representing a fully qualified class name to such class\nc) a dictionary with ``type`` key pointing to a string from b),\nother keys are passed to the Dataset initializer.\nCredentials for the dataset can be explicitly specified in\nthis configuration." }, "filepath_arg": { "type": "string", "description": "Underlying dataset initializer argument that will\ncontain a path to each corresponding partition file.\nIf unspecified, defaults to \"filepath\"." }, "filename_suffix": { "type": "string", "description": "If specified, only partitions that end with this\nstring will be processed." }, "credentials": { "type": "object", "description": "Protocol-specific options that will be passed to\n``fsspec.filesystem``\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.filesystem\nand the dataset initializer. If the dataset config contains\nexplicit credentials spec, then such spec will take precedence.\nAll possible credentials management scenarios are documented here:\nhttps://kedro.readthedocs.io/en/0.17.0/05_data/02_kedro_io.html#partitioned-dataset-credentials" }, "load_args": { "type": "object", "description": "Keyword arguments to be passed into ``find()`` method of\nthe filesystem implementation." }, "fs_args": { "type": "object", "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``)" } } } }, { "if": { "properties": { "type": { "const": "CachedDataSet" } } }, "then": { "required": ["dataset"], "properties": { "dataset": { "pattern": ".*", "description": "A Kedro DataSet object or a dictionary to cache." }, "copy_mode": { "type": "string", "description": "The copy mode used to copy the data. Possible\nvalues are: \"deepcopy\", \"copy\" and \"assign\". 
If not\nprovided, it is inferred based on the data type." } } } }, { "if": { "properties": { "type": { "const": "MemoryDataSet" } } }, "then": { "required": [], "properties": { "data": { "pattern": ".*", "description": "Python object containing the data." }, "copy_mode": { "type": "string", "description": "The copy mode used to copy the data. Possible\nvalues are: \"deepcopy\", \"copy\" and \"assign\". If not\nprovided, it is inferred based on the data type." } } } }, { "if": { "properties": { "type": { "const": "LambdaDataSet" } } }, "then": { "required": ["load", "save"], "properties": { "load": { "pattern": ".*", "description": "Method to load data from a data set." }, "save": { "pattern": ".*", "description": "Method to save data to a data set." }, "exists": { "pattern": ".*", "description": "Method to check whether output data already exists." }, "release": { "pattern": ".*", "description": "Method to release any cached information." } } } }, { "if": { "properties": { "type": { "const": "networkx.NetworkXDataSet" } } }, "then": { "required": ["filepath"], "properties": { "filepath": { "type": "string", "description": "Filepath in POSIX format to the NetworkX graph JSON file." }, "load_args": { "type": "object", "description": "Arguments passed on to ``networkx.node_link_graph``.\nSee the details in\nhttps://networkx.github.io/documentation/networkx-1.9.1/reference/generated/networkx.readwrite.json_graph.node_link_graph.html" }, "save_args": { "type": "object", "description": "Arguments passed on to ``networkx.node_link_data``.\nSee the details in\nhttps://networkx.github.io/documentation/networkx-1.9.1/reference/generated/networkx.readwrite.json_graph.node_link_data.html" }, "credentials": { "type": "object", "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`."
}, "fs_args": { "type": "object", "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." } } } }, { "if": { "properties": { "type": { "const": "dask.ParquetDataSet" } } }, "then": { "required": ["filepath"], "properties": { "filepath": { "type": "string", "description": "Filepath in POSIX format to a parquet file\nparquet collection or the directory of a multipart parquet." }, "load_args": { "type": "object", "description": "Additional loading options `dask.dataframe.read_parquet`:\nhttps://docs.dask.org/en/latest/dataframe-api.html#dask.dataframe.read_parquet" }, "save_args": { "type": "object", "description": "Additional saving options for `dask.dataframe.to_parquet`:\nhttps://docs.dask.org/en/latest/dataframe-api.html#dask.dataframe.to_parquet" }, "credentials": { "type": "object", "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." }, "fs_args": { "type": "object", "description": "Optional parameters to the backend file system driver:\nhttps://docs.dask.org/en/latest/remote-data-services.html#optional-parameters" } } } }, { "if": { "properties": { "type": { "const": "geopandas.GeoJSONDataSet" } } }, "then": { "required": ["filepath"], "properties": { "filepath": { "type": "string", "description": "Filepath in POSIX format to a GeoJSON file prefixed with a protocol like\n`s3://`. 
If prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." }, "load_args": { "type": "object", "description": "GeoPandas options for loading GeoJSON files.\nHere you can find all available arguments:\nhttps://geopandas.org/reference/geopandas.read_file.html" }, "save_args": { "type": "object", "description": "GeoPandas options for saving GeoJSON files.\nHere you can find all available arguments:\nhttps://geopandas.org/reference.html#geopandas.GeoDataFrame.to_file\nThe default_save_arg driver is 'GeoJSON', all others preserved." }, "credentials": { "type": "object", "description": "Credentials required to access the underlying filesystem.\nE.g. for ``GCSFileSystem`` it would look like `{'token': None}`." }, "fs_args": { "type": "object", "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." } } } }, { "if": { "properties": { "type": { "const": "pillow.ImageDataSet" } } }, "then": { "required": ["filepath"], "properties": { "filepath": { "type": "string", "description": "Filepath in POSIX format to an image file prefixed with a protocol like\n`s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning."
}, "save_args": { "type": "object", "description": "Pillow options for saving image files.\nHere you can find all available arguments:\nhttps://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.Image.save\nAll defaults are preserved." }, "credentials": { "type": "object", "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." }, "fs_args": { "type": "object", "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." } } } }, { "if": { "properties": { "type": { "const": "json.JSONDataSet" } } }, "then": { "required": ["filepath"], "properties": { "filepath": { "type": "string", "description": "Filepath in POSIX format to a JSON file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." }, "save_args": { "type": "object", "description": "json options for saving JSON files (arguments passed\ninto ``json.dump``). Here you can find all available arguments:\nhttps://docs.python.org/3/library/json.html\nAll defaults are preserved, but \"default_flow_style\", which is set to False." }, "credentials": { "type": "object", "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`."
}, "fs_args": { "type": "object", "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." } } } }, { "if": { "properties": { "type": { "const": "biosequence.BioSequenceDataSet" } } }, "then": { "required": ["filepath"], "properties": { "filepath": { "type": "string", "description": "Filepath in POSIX format to sequence file prefixed with a protocol like\n`s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``." }, "load_args": { "type": "object", "description": "Options for parsing sequence files by Biopython ``SeqIO.parse()``." }, "save_args": { "type": "object", "description": "file format supported by Biopython ``SeqIO.write()``.\nE.g. `{\"format\": \"fasta\"}`." }, "credentials": { "type": "object", "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." }, "fs_args": { "type": "object", "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. 
`{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\n to pass to the filesystem's `open` method through nested keys\n `open_args_load` and `open_args_save`.\n Here you can find all available arguments for `open`:\n https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\n All defaults are preserved, except `mode`, which is set to `r` when loading\n and to `w` when saving.\n\nNote: Here you can find all supported file formats: https://biopython.org/wiki/SeqIO" } } } }, { "if": { "properties": { "type": { "const": "tensorflow.TensorFlowModelDataset" } } }, "then": { "required": ["filepath"], "properties": { "filepath": { "type": "string", "description": "Filepath in POSIX format to a TensorFlow model directory prefixed with a\nprotocol like `s3://`. If prefix is not provided `file` protocol (local filesystem)\nwill be used. The prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." }, "load_args": { "type": "object", "description": "TensorFlow options for loading models.\nHere you can find all available arguments:\nhttps://www.tensorflow.org/api_docs/python/tf/keras/models/load_model\nAll defaults are preserved." }, "save_args": { "type": "object", "description": "TensorFlow options for saving models.\nHere you can find all available arguments:\nhttps://www.tensorflow.org/api_docs/python/tf/keras/models/save_model\nAll defaults are preserved, except for \"save_format\", which is set to \"tf\"." }, "credentials": { "type": "object", "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{'token': None}`." }, "fs_args": { "type": "object", "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``)." 
} } } }, { "if": { "properties": { "type": { "const": "api.APIDataSet" } } }, "then": { "required": ["url"], "properties": { "url": { "type": "string", "description": "The API URL endpoint." }, "method": { "type": "string", "description": "The Method of the request, GET, POST, PUT, DELETE, HEAD, etc..." }, "data": { "pattern": ".*", "description": "The request payload, used for POST, PUT, etc requests\nhttps://requests.readthedocs.io/en/master/user/quickstart/#more-complicated-post-requests" }, "params": { "type": "object", "description": "The url parameters of the API.\nhttps://requests.readthedocs.io/en/master/user/quickstart/#passing-parameters-in-urls" }, "headers": { "type": "object", "description": "The HTTP headers.\nhttps://requests.readthedocs.io/en/master/user/quickstart/#custom-headers" }, "auth": { "pattern": ".*", "description": "Anything ``requests`` accepts. Normally it's either ``('login', 'password')``,\nor ``AuthBase``, ``HTTPBasicAuth`` instance for more complex cases." }, "json": { "pattern": ".*", "description": "The request payload, used for POST, PUT, etc requests, passed in\nto the json kwarg in the requests object.\nhttps://requests.readthedocs.io/en/master/user/quickstart/#more-complicated-post-requests" }, "timeout": { "type": "integer", "description": "The wait time in seconds for a response, defaults to 1 minute.\nhttps://requests.readthedocs.io/en/master/user/quickstart/#timeouts" }, "credentials": { "pattern": ".*", "description": "Same as ``auth``. Allows specifying ``auth`` secrets in \ncredentials.yml." } } } }, { "if": { "properties": { "type": { "const": "matplotlib.MatplotlibWriter" } } }, "then": { "required": ["filepath"], "properties": { "filepath": { "type": "string", "description": "Filepath in POSIX format to a matplot object file(s) prefixed with a protocol\nlike `s3://`. If prefix is not provided, `file` protocol (local filesystem) will be\nused. The prefix should be any protocol supported by ``fsspec``." 
}, "fs_args": { "type": "object", "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested key `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." }, "credentials": { "type": "object", "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``S3FileSystem`` it should look like:\n`{'key': '', 'secret': ''}`" }, "save_args": { "type": "object", "description": "Save args passed to `plt.savefig`. See\nhttps://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html" } } } }, { "if": { "properties": { "type": { "const": "yaml.YAMLDataSet" } } }, "then": { "required": ["filepath"], "properties": { "filepath": { "type": "string", "description": "Filepath in POSIX format to a YAML file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." }, "save_args": { "type": "object", "description": "PyYAML options for saving YAML files (arguments passed\ninto ``yaml.dump``). Here you can find all available arguments:\nhttps://pyyaml.org/wiki/PyYAMLDocumentation\nAll defaults are preserved, but \"default_flow_style\", which is set to False." }, "credentials": { "type": "object", "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." }, "fs_args": { "type": "object", "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. 
`{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." } } } }, { "if": { "properties": { "type": { "const": "pickle.PickleDataSet" } } }, "then": { "required": ["filepath"], "properties": { "filepath": { "type": "string", "description": "Filepath in POSIX format to a Pickle file prefixed with a protocol like\n`s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." }, "backend": { "type": "string", "description": "Backend to use, must be one of ['pickle', 'joblib']. Defaults to 'pickle'." }, "load_args": { "type": "object", "description": "Pickle options for loading pickle files.\nHere you can find all available arguments for different backends:\npickle.load: https://docs.python.org/3/library/pickle.html#pickle.load\njoblib.load: https://joblib.readthedocs.io/en/latest/generated/joblib.load.html\nAll defaults are preserved." }, "save_args": { "type": "object", "description": "Pickle options for saving pickle files.\nHere you can find all available arguments for different backends:\npickle.dump: https://docs.python.org/3/library/pickle.html#pickle.dump\njoblib.dump: https://joblib.readthedocs.io/en/latest/generated/joblib.dump.html\nAll defaults are preserved." }, "credentials": { "type": "object", "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." }, "fs_args": { "type": "object", "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. 
`{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." } } } }, { "if": { "properties": { "type": { "const": "text.TextDataSet" } } }, "then": { "required": ["filepath"], "properties": { "filepath": { "type": "string", "description": "Filepath in POSIX format to a text file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." }, "credentials": { "type": "object", "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." }, "fs_args": { "type": "object", "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." 
} } } }, { "if": { "properties": { "type": { "const": "holoviews.HoloviewsWriter" } } }, "then": { "required": ["filepath"], "properties": { "filepath": { "type": "string", "description": "Filepath in POSIX format to a text file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." }, "fs_args": { "type": "object", "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested key `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." }, "credentials": { "type": "object", "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``S3FileSystem`` it should look like:\n`{'key': '', 'secret': ''}`" }, "save_args": { "type": "object", "description": "Extra save args passed to `holoviews.save()`. See\nhttp://holoviews.org/reference_manual/holoviews.util.html#holoviews.util.save" } } } }, { "if": { "properties": { "type": { "const": "email.EmailMessageDataSet" } } }, "then": { "required": ["filepath"], "properties": { "filepath": { "type": "string", "description": "Filepath in POSIX format to a text file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." }, "load_args": { "type": "object", "description": "``email`` options for parsing email messages (arguments passed\ninto ``email.parser.Parser.parse``). 
Here you can find all available arguments:\nhttps://docs.python.org/3/library/email.parser.html#email.parser.Parser.parse\nIf you would like to specify options for the `Parser`,\nyou can include them under the \"parser\" key. Here you can\nfind all available arguments:\nhttps://docs.python.org/3/library/email.parser.html#email.parser.Parser\nAll defaults are preserved, but \"policy\", which is set to ``email.policy.default``." }, "save_args": { "type": "object", "description": "``email`` options for generating MIME documents (arguments passed into\n``email.generator.Generator.flatten``). Here you can find all available arguments:\nhttps://docs.python.org/3/library/email.generator.html#email.generator.Generator.flatten\nIf you would like to specify options for the `Generator`,\nyou can include them under the \"generator\" key. Here you can\nfind all available arguments:\nhttps://docs.python.org/3/library/email.generator.html#email.generator.Generator\nAll defaults are preserved." }, "credentials": { "type": "object", "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." }, "fs_args": { "type": "object", "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." } } } }, { "if": { "properties": { "type": { "const": "spark.SparkJDBCDataSet" } } }, "then": { "required": ["url", "table"], "properties": { "url": { "type": "string", "description": "A JDBC URL of the form ``jdbc:subprotocol:subname``." 
}, "table": { "type": "string", "description": "The name of the table to load or save data to." }, "credentials": { "type": "object", "description": "A dictionary of JDBC database connection arguments.\nNormally at least properties ``user`` and ``password`` with\ntheir corresponding values. It updates ``properties``\nparameter in ``load_args`` and ``save_args`` in case it is\nprovided." }, "load_args": { "type": "object", "description": "Provided to underlying PySpark ``jdbc`` function along\nwith the JDBC URL and the name of the table. To find all\nsupported arguments, see here:\nhttps://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrameReader.jdbc.html" }, "save_args": { "type": "object", "description": "Provided to underlying PySpark ``jdbc`` function along\nwith the JDBC URL and the name of the table. To find all\nsupported arguments, see here:\nhttps://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrameWriter.jdbc.html" } } } }, { "if": { "properties": { "type": { "const": "spark.SparkHiveDataSet" } } }, "then": { "required": ["database", "table", "write_mode"], "properties": { "database": { "type": "string", "description": "The name of the hive database." }, "table": { "type": "string", "description": "The name of the table within the database." }, "write_mode": { "type": "string", "description": "``insert``, ``upsert`` or ``overwrite`` are supported." }, "table_pk": { "type": "array", "description": "If performing an upsert, this identifies the primary key columns used to\nresolve preexisting data. Is required for ``write_mode=\"upsert\"``." } } } }, { "if": { "properties": { "type": { "const": "spark.SparkDataSet" } } }, "then": { "required": ["filepath"], "properties": { "filepath": { "type": "string", "description": "Filepath in POSIX format to a Spark dataframe. 
When using Databricks\nand working with data written to mount path points,\nspecify ``filepath``s for (versioned) ``SparkDataSet``s\nstarting with ``/dbfs/mnt``." }, "file_format": { "type": "string", "description": "File format used during load and save\noperations. These are formats supported by the running\nSparkContext, including parquet and csv. For a list of supported\nformats please refer to Apache Spark documentation at\nhttps://spark.apache.org/docs/latest/sql-programming-guide.html" }, "load_args": { "type": "object", "description": "Load args passed to Spark DataFrameReader load method.\nIt is dependent on the selected file format. You can find\na list of read options for each supported format\nin Spark DataFrame read documentation:\nhttps://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.html" }, "save_args": { "type": "object", "description": "Save args passed to Spark DataFrame write options.\nSimilar to load_args this is dependent on the selected file\nformat. You can pass ``mode`` and ``partitionBy`` to specify\nyour overwrite mode and partitioning respectively. You can find\na list of options for each format in Spark DataFrame\nwrite documentation:\nhttps://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.html" }, "credentials": { "type": "object", "description": "Credentials to access the S3 bucket, such as\n``key``, ``secret``, if ``filepath`` prefix is ``s3a://`` or ``s3n://``.\nOptional keyword arguments passed to ``hdfs.client.InsecureClient``\nif ``filepath`` prefix is ``hdfs://``. Ignored otherwise." } } } }, { "if": { "properties": { "type": { "const": "pandas.AppendableExcelDataSet" } } }, "then": { "required": ["filepath"], "properties": { "filepath": { "type": "string", "description": "Filepath in POSIX format to an existing local Excel file."
}, "load_args": { "type": "object", "description": "Pandas options for loading Excel files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_excel.html\nAll defaults are preserved, but \"engine\", which is set to \"openpyxl\"." }, "save_args": { "type": "object", "description": "Pandas options for saving Excel files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_excel.html\nAll defaults are preserved, but \"index\", which is set to False.\nIf you would like to specify options for the `ExcelWriter`,\nyou can include them under \"writer\" key. Here you can\nfind all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.ExcelWriter.html\nNote: `mode` option of `ExcelWriter` is set to `a` and it can not be overridden." } } } }, { "if": { "properties": { "type": { "const": "pandas.JSONDataSet" } } }, "then": { "required": ["filepath"], "properties": { "filepath": { "type": "string", "description": "Filepath in POSIX format to a JSON file prefixed with a protocol like `s3://`.\nIf prefix is not provided `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." }, "load_args": { "type": "object", "description": "Pandas options for loading JSON files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_json.html\nAll defaults are preserved." }, "save_args": { "type": "object", "description": "Pandas options for saving JSON files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_json.html\nAll defaults are preserved, but \"index\", which is set to False." }, "credentials": { "type": "object", "description": "Credentials required to get access to the underlying filesystem.\nE.g. 
for ``GCSFileSystem`` it should look like `{'token': None}`." }, "fs_args": { "type": "object", "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." } } } }, { "if": { "properties": { "type": { "const": "pandas.SQLTableDataSet" } } }, "then": { "required": ["table_name", "credentials"], "properties": { "table_name": { "type": "string", "description": "The table name to load or save data to. It\noverwrites name in ``save_args`` and ``table_name``\nparameters in ``load_args``." }, "credentials": { "type": "object", "description": "A dictionary with a ``SQLAlchemy`` connection string.\nUsers are supposed to provide the connection string 'con'\nthrough credentials. It overwrites `con` parameter in\n``load_args`` and ``save_args`` in case it is provided. 
To find\nall supported connection string formats, see here:\nhttps://docs.sqlalchemy.org/en/13/core/engines.html#database-urls" }, "load_args": { "type": "object", "description": "Provided to underlying pandas ``read_sql_table``\nfunction along with the connection string.\nTo find all supported arguments, see here:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html\nTo find all supported connection string formats, see here:\nhttps://docs.sqlalchemy.org/en/13/core/engines.html#database-urls" }, "save_args": { "type": "object", "description": "Provided to underlying pandas ``to_sql`` function along\nwith the connection string.\nTo find all supported arguments, see here:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_sql.html\nTo find all supported connection string formats, see here:\nhttps://docs.sqlalchemy.org/en/13/core/engines.html#database-urls\nIt has ``index=False`` in the default parameters." } } } }, { "if": { "properties": { "type": { "const": "pandas.SQLQueryDataSet" } } }, "then": { "required": ["sql", "credentials"], "properties": { "sql": { "type": "string", "description": "The sql query statement." }, "credentials": { "type": "object", "description": "A dictionary with a ``SQLAlchemy`` connection string.\nUsers are supposed to provide the connection string 'con'\nthrough credentials. It overwrites `con` parameter in\n``load_args`` and ``save_args`` in case it is provided. 
To find\nall supported connection string formats, see here:\nhttps://docs.sqlalchemy.org/en/13/core/engines.html#database-urls" }, "load_args": { "type": "object", "description": "Provided to underlying pandas ``read_sql_query``\nfunction along with the connection string.\nTo find all supported arguments, see here:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_query.html\nTo find all supported connection string formats, see here:\nhttps://docs.sqlalchemy.org/en/13/core/engines.html#database-urls" } } } }, { "if": { "properties": { "type": { "const": "pandas.ParquetDataSet" } } }, "then": { "required": ["filepath"], "properties": { "filepath": { "type": "string", "description": "Filepath in POSIX format to a Parquet file prefixed with a protocol like\n`s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nIt can also be a path to a directory. If the directory is\nprovided then it can be used for reading partitioned parquet files.\nNote: `http(s)` doesn't support versioning." }, "load_args": { "type": "object", "description": "Additional options for loading Parquet file(s).\nHere you can find all available arguments when reading single file:\nhttps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_parquet.html\nHere you can find all available arguments when reading partitioned datasets:\nhttps://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html#pyarrow.parquet.ParquetDataset.read\nAll defaults are preserved." 
}, "save_args": { "type": "object", "description": "Additional saving options for `pyarrow.parquet.write_table` and\n`pyarrow.Table.from_pandas`.\nHere you can find all available arguments for `write_table()`:\nhttps://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html?highlight=write_table#pyarrow.parquet.write_table\nThe arguments for `from_pandas()` should be passed through a nested\nkey: `from_pandas`. E.g.: `save_args = {\"from_pandas\": {\"preserve_index\": False}}`\nHere you can find all available arguments for `from_pandas()`:\nhttps://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table.from_pandas" }, "credentials": { "type": "object", "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." }, "fs_args": { "type": "object", "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved." } } } }, { "if": { "properties": { "type": { "const": "pandas.FeatherDataSet" } } }, "then": { "required": ["filepath"], "properties": { "filepath": { "type": "string", "description": "Filepath in POSIX format to a feather file prefixed with a protocol like\n`s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." 
}, "load_args": { "type": "object", "description": "Pandas options for loading feather files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_feather.html\nAll defaults are preserved." }, "credentials": { "type": "object", "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." }, "fs_args": { "type": "object", "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." } } } }, { "if": { "properties": { "type": { "const": "pandas.HDFDataSet" } } }, "then": { "required": ["filepath", "key"], "properties": { "filepath": { "type": "string", "description": "Filepath in POSIX format to a hdf file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." }, "key": { "type": "string", "description": "Identifier to the group in the HDF store." }, "load_args": { "type": "object", "description": "PyTables options for loading hdf files.\nYou can find all available arguments at:\nhttps://www.pytables.org/usersguide/libref/top_level.html#tables.open_file\nAll defaults are preserved." }, "save_args": { "type": "object", "description": "PyTables options for saving hdf files.\nYou can find all available arguments at:\nhttps://www.pytables.org/usersguide/libref/top_level.html#tables.open_file\nAll defaults are preserved." 
}, "credentials": { "type": "object", "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." }, "fs_args": { "type": "object", "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." } } } }, { "if": { "properties": { "type": { "const": "pandas.CSVDataSet" } } }, "then": { "required": ["filepath"], "properties": { "filepath": { "type": "string", "description": "Filepath in POSIX format to a CSV file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." }, "load_args": { "type": "object", "description": "Pandas options for loading CSV files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html\nAll defaults are preserved." }, "save_args": { "type": "object", "description": "Pandas options for saving CSV files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html\nAll defaults are preserved, but \"index\", which is set to False." }, "credentials": { "type": "object", "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." }, "fs_args": { "type": "object", "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. 
`{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." } } } }, { "if": { "properties": { "type": { "const": "pandas.GenericDataSet" } } }, "then": { "required": ["filepath"], "properties": { "filepath": { "type": "string", "description": "Filepath in POSIX format to a file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." }, "file_format" : { "type": "string", "description": "The read/write methods to retrieve from pandas (`pandas.read_{file_format}` or `pd.DataFrame.to_{file_format}`) on a best effort basis." }, "load_args": { "type": "object", "description": "Pandas options for loading files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html\nAll defaults are preserved." }, "save_args": { "type": "object", "description": "Pandas options for saving files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html\nAll defaults are preserved, but \"index\", which is set to False." }, "credentials": { "type": "object", "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." }, "fs_args": { "type": "object", "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. 
`{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." } } } }, { "if": { "properties": { "type": { "const": "pandas.ExcelDataSet" } } }, "then": { "required": ["filepath"], "properties": { "filepath": { "type": "string", "description": "Filepath in POSIX format to an Excel file prefixed with a protocol like\n`s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." }, "engine": { "type": "string", "description": "The engine used to write to Excel files. The default\nengine is 'xlsxwriter'." }, "load_args": { "type": "object", "description": "Pandas options for loading Excel files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_excel.html\nAll defaults are preserved, but \"engine\", which is set to \"xlrd\"." }, "save_args": { "type": "object", "description": "Pandas options for saving Excel files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_excel.html\nAll defaults are preserved, but \"index\", which is set to False.\nIf you would like to specify options for the `ExcelWriter`,\nyou can include them under the \"writer\" key. Here you can\nfind all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.ExcelWriter.html" }, "credentials": { "type": "object", "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." 
}, "fs_args": { "type": "object", "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." } } } }, { "if": { "properties": { "type": { "const": "pandas.GBQTableDataSet" } } }, "then": { "required": ["dataset", "table_name"], "properties": { "dataset": { "type": "string", "description": "Google BigQuery dataset." }, "table_name": { "type": "string", "description": "Google BigQuery table name." }, "project": { "type": "string", "description": "Google BigQuery Account project ID.\nOptional when available from the environment.\nhttps://cloud.google.com/resource-manager/docs/creating-managing-projects" }, "credentials": { "pattern": ".*", "description": "Credentials for accessing Google APIs.\nEither ``google.auth.credentials.Credentials`` object or dictionary with\nparameters required to instantiate ``google.oauth2.credentials.Credentials``.\nHere you can find all the arguments:\nhttps://google-auth.readthedocs.io/en/latest/reference/google.oauth2.credentials.html" }, "load_args": { "type": "object", "description": "Pandas options for loading BigQuery table into DataFrame.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_gbq.html\nAll defaults are preserved." }, "save_args": { "type": "object", "description": "Pandas options for saving DataFrame to BigQuery table.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_gbq.html\nAll defaults are preserved, but \"progress_bar\", which is set to False." 
} } } } ] } } }