/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * Copyright by The HDF Group. * * All rights reserved. * * * * This file is part of HDF5. The full HDF5 copyright notice, including * * terms governing use, modification, and redistribution, is contained in * * the COPYING file, which can be found at the root of the source code * * distribution tree, or in https://www.hdfgroup.org/licenses. * * If you do not have access to either file, you may request a copy from * * help@hdfgroup.org. * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ /* * This program illustrates the usage of HDF5's implicit message sharing * feature, which can be used to save space when the same messages are * used many times in a file. * * This example creates a standard file using file creation property lists * to control which messages are shared. Messages that can be shared are * datatypes, dataspaces, attributes, fill values, and filter pipelines. * */ #include #include "hdf5.h" #define NUM_DATASETS 40 const char *DSETNAME[] = {"dataset0", "dataset1", "dataset2", "dataset3", "dataset4", "dataset5", "dataset6", "dataset7", "dataset8", "dataset9", "dataset10", "dataset11", "dataset12", "dataset13", "dataset14", "dataset15", "dataset16", "dataset17", "dataset18", "dataset19", "dataset20", "dataset21", "dataset22", "dataset23", "dataset24", "dataset25", "dataset26", "dataset27", "dataset28", "dataset29", "dataset30", "dataset31", "dataset32", "dataset33", "dataset34", "dataset35", "dataset36", "dataset37", "dataset38", "dataset39", NULL}; herr_t create_standard_file(const char *filename, hid_t fcpl); /*------------------------------------------------------------------------- * Function: main * * Purpose: Enables shared messages using File Creation Property Lists * and creates files using these settings. * *------------------------------------------------------------------------- */ int main(void) { hid_t fcpl_id; herr_t ret; /* Create a file creation property list */ fcpl_id = H5Pcreate(H5P_FILE_CREATE); if (fcpl_id < 0) goto error; /* The file creation property list is the default list right now. * Create a file using it (this is the same as creating a file with * H5P_DEFAULT). Implicit shared messages will be disabled. */ ret = create_standard_file("default_file.h5", fcpl_id); if (ret < 0) goto error; /* There are five kinds of messages that can be shared: datatypes, * dataspaces, attributes, fill values, and filter pipelines. * Shared messages are stored in up to five "indexes," where each * index can contain one or more types of message. Using more indexes * will result in more overhead for sharing, but can also provide * more "tunability" and may affect caching performance. */ /* To begin with, use only one index. */ ret = H5Pset_shared_mesg_nindexes(fcpl_id, 1); if (ret < 0) goto error; /* Each index has a "minimum message size" for a message of that * type to be shared. Since sharing a message creates some overhead, * this is to prevent this overhead for very small messages when little * space would be saved by sharing them anyway. * If the content of the file isn't known beforehand, it's probably best * to set the minimum size "high"; over 100 or 200 bytes. If the content * of the file is known, this value can be used to trade space saved for * performance lost. The smaller this value is, the more messages will * be shared, so the more overhead will be incurred. * This value is in bytes. A shared message involves about 30 bytes of * overhead. Note that even messages that are only written once will * require this overhead (since they "might" be shared in the future), * so setting the minimum size too low may result in a file actually growing * in size. * For this example case, we'll set the minimum sharing size to be small * since we know that every message the "standard" file uses will be * repeated many times. */ /* The other property that each index has is the kinds of messages that * it holds. For the simple case, we'll put every message that could be * shared in this single index. */ ret = H5Pset_shared_mesg_index(fcpl_id, 0, H5O_SHMESG_ALL_FLAG, 40); if (ret < 0) goto error; /* The other property that can be set for shared messages is the * list/B-tree cutoff for the indexes. * Each shared message index beins life as a simple list of messages * and becomes a B-tree when "too many" messages are written to it. * This keeps the indexes simple when only a few messages are shared, * but allows them to scale for many messages. If many messages are * deleted from the B-tree, it scales back down into a list. * A "reasonable" setting for maximum list size and minimum btree size * depends on what kinds of messages will be stored in the file. * These numbers are the same for all indexes in a file. * We'll guess at some numbers, though we could just as easily have kept * the default values. The first value is the maximum list size, the * second the minimum B-tree size. */ ret = H5Pset_shared_mesg_phase_change(fcpl_id, 30, 20); if (ret < 0) goto error; /* Now create a file with this property list. After the FCPL is used, * everything is automatic; messages will be shared and this will be * completely transparent to the user. Even if the file is closed * and re-opened, this settings will be saved and applied to messages * written later. */ ret = create_standard_file("one_index_file.h5", fcpl_id); if (ret < 0) goto error; /* Now try some variations on this. The FCPL hasn't been closed, so * we don't need to re-create it. * For instance, if we set the index to only share very large * messages, none of the messages we write will qualify and the file * will be about the same size as a normal file (with just a little extra * overhead). */ ret = H5Pset_shared_mesg_index(fcpl_id, 0, H5O_SHMESG_ALL_FLAG, 1000); if (ret < 0) goto error; ret = create_standard_file("only_huge_mesgs_file.h5", fcpl_id); if (ret < 0) goto error; /* Or, suppose we only wanted to shared dataspaces and * attributes (which might make sense if we were going to use committed * datatypes). We could change the flags on the index: */ ret = H5Pset_shared_mesg_index(fcpl_id, 0, H5O_SHMESG_SDSPACE_FLAG | H5O_SHMESG_ATTR_FLAG, 40); if (ret < 0) goto error; ret = create_standard_file("only_dspaces_and_attrs_file.h5", fcpl_id); if (ret < 0) goto error; /* We could create a second index and put attributes in it to separate them * from datatypes and dataspaces (and then run some performance metrics to * see whether this improved caching performance). */ ret = H5Pset_shared_mesg_nindexes(fcpl_id, 2); if (ret < 0) goto error; ret = H5Pset_shared_mesg_index(fcpl_id, 0, H5O_SHMESG_DTYPE_FLAG | H5O_SHMESG_SDSPACE_FLAG, 40); if (ret < 0) goto error; ret = H5Pset_shared_mesg_index(fcpl_id, 1, H5O_SHMESG_ATTR_FLAG, 40); if (ret < 0) goto error; ret = create_standard_file("separate_indexes_file.h5", fcpl_id); if (ret < 0) goto error; /* We can try twiddling the "phase change" values and see what it does to * the file size. Since there's only a few different messages (two * datatypes, two dataspaces, and one attribute), using smaller lists will * save some space. */ ret = H5Pset_shared_mesg_nindexes(fcpl_id, 1); if (ret < 0) goto error; ret = H5Pset_shared_mesg_index(fcpl_id, 0, H5O_SHMESG_ALL_FLAG, 40); if (ret < 0) goto error; ret = H5Pset_shared_mesg_phase_change(fcpl_id, 5, 0); if (ret < 0) goto error; ret = create_standard_file("small_lists_file.h5", fcpl_id); if (ret < 0) goto error; /* Or we could create indexes that are never lists, but are created as * B-trees. We do this by setting the "maximum list size" to zero. */ ret = H5Pset_shared_mesg_phase_change(fcpl_id, 0, 0); if (ret < 0) goto error; ret = create_standard_file("btrees_file.h5", fcpl_id); if (ret < 0) goto error; /* Obviously there are a lot more permutations of these options possible. * Performance will often be a tradeoff of speed for space, but will * depend a great deal on the specific application. If performance is * important, the best thing to do is to play with these settings to find * the ones that work best for you. * Please let The HDF Group (help@hdfgroup.org) know what you find! */ /* Close the property list */ ret = H5Pclose(fcpl_id); if (ret < 0) goto error; return 0; error: return -1; } /*------------------------------------------------------------------------- * Function: create_standard_file * * Purpose: A helper function for the example. Creates an HDF5 file * with many repeated messages using the file creation * property list FCPL. * * This function only uses datatypes, dataspaces, and * attributes. Fill values and filter pipelines can also * be shared in the same way (i.e., by enabling sharing in * the FCPL and writing the same message more than once). *------------------------------------------------------------------------- */ herr_t create_standard_file(const char *filename, hid_t fcpl_id) { hid_t file_id = -1; hid_t type_id = -1, temp_type_id = -1; hsize_t dims[] = {10, 9, 8, 7, 6, 5, 4, 3, 2, 1}; hid_t space_id = -1; hid_t attr_type_id = -1; hid_t attr_space_id = -1; int attr_data[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 0}; hid_t dset_id = -1; hid_t attr_id = -1; int x; herr_t ret; /* Create the file */ file_id = H5Fcreate(filename, H5F_ACC_TRUNC, fcpl_id, H5P_DEFAULT); if (file_id < 0) goto error; /* Create the datatype we'll be using. Generally, sharing messages * is most useful when the message is complex and takes more space on * disk, so this type will be an array type rather than an atomic type. * However, any type can be shared. */ temp_type_id = H5Tarray_create2(H5T_NATIVE_INT, 2, dims); if (temp_type_id < 0) goto error; type_id = H5Tarray_create2(temp_type_id, 2, dims); if (type_id < 0) goto error; ret = H5Tclose(temp_type_id); if (ret < 0) goto error; /* Create the dataspace we'll be using. * Again, create a more complex dataspace so that more space will * be saved when we share it. */ space_id = H5Screate_simple(10, dims, dims); if (space_id < 0) goto error; /* Create a datatype and dataspace for the attributes we'll be creating. * The datatype will be a single integer, and each attribute will hold * 10 integers. */ attr_type_id = H5Tcopy(H5T_NATIVE_INT); if (attr_type_id < 0) goto error; attr_space_id = H5Screate_simple(1, dims, dims); if (attr_space_id < 0) goto error; /* Begin using the messages many times. Do this by creating datasets * that use this datatype, dataspace, and have this attribute. */ for (x = 0; x < NUM_DATASETS; ++x) { /* Create a dataset */ dset_id = H5Dcreate2(file_id, DSETNAME[x], type_id, space_id, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); if (dset_id < 0) goto error; /* Create an attribute on the dataset */ attr_id = H5Acreate2(dset_id, "attr_name", attr_type_id, attr_space_id, H5P_DEFAULT, H5P_DEFAULT); if (attr_id < 0) goto error; /* Write data to the attribute */ ret = H5Awrite(attr_id, H5T_NATIVE_INT, attr_data); if (ret < 0) goto error; ret = H5Aclose(attr_id); if (ret < 0) goto error; ret = H5Dclose(dset_id); if (ret < 0) goto error; } /* Close all open IDs */ ret = H5Tclose(attr_type_id); if (ret < 0) goto error; ret = H5Sclose(attr_space_id); if (ret < 0) goto error; ret = H5Tclose(type_id); if (ret < 0) goto error; ret = H5Sclose(space_id); if (ret < 0) goto error; ret = H5Fclose(file_id); if (ret < 0) goto error; return 0; error: return -1; }