chaosMonkeys:
  # Kills Hazelcast members. As of Feb. 2023/v0.8.0 of Hazeltest, the member killer monkey
  # can only target members running in a Kubernetes cluster, but you can choose between an
  # in-cluster access mode and an out-of-cluster access mode, see below.
  memberKiller:
    # Enables or disables the member killer monkey.
    enabled: true
    # Configures the number of runs or iterations the monkey will perform. Note that
    # number of runs != number of kills in case the chaos probability is set to something
    # less than 1.0 (100%), but the number evaluated to decide whether the monkey should
    # terminate is indeed the number of runs, and not the number of kills.
    numRuns: 100
    # In each run, the monkey will strike with this probability. The provided number expresses
    # a percentage, i.e. 1.0 corresponds to 100%. For this value to be semantically correct,
    # therefore, it must be within the closed interval [0,1].
    chaosProbability: 0.5
    memberAccess:
      # Can take the value of one of its immediate object-type sub-keys, i.e. 'k8sOutOfCluster' or 'k8sInCluster'
      mode: k8sInCluster
      # Controls whether the member killer monkey should only consider active ("ready") members. If this is set to
      # true, the monkey will not kill Pods that haven't achieved readiness.
      targetOnlyActive: true
      # Mode for accessing Hazelcast members from outside the Kubernetes cluster. To connect to the Kubernetes cluster,
      # a kubeconfig file is used.
      # Handy when testing Hazeltest configurations locally prior to a deployment via Helm. However, since the absolute
      # path to the kubeconfig file can be specified by means of the corresponding property (see below), it might also
      # be used for in-cluster access by mounting the kubeconfig file into the Hazeltest Pod and specifying the path
      # accordingly.
      k8sOutOfCluster:
        # Specifies the absolute path to the kubeconfig file to be used to connect to the Kubernetes cluster
        # the target Hazelcast members run on.
        # If 'default' is given, Hazeltest will look for the Kubeconfig in $HOME/.kube/config.
        # In non-default cases, the absolute path of the file to be used can be specified.
        kubeconfig: default
        # The namespace in which to search for Hazelcast member Pods.
        namespace: hazelcastplatform
        # The label selector to use for identifying the Hazelcast member Pods.
        labelSelector: app.kubernetes.io/name=hazelcastplatform
      # Mode for accessing Hazelcast members from within the Kubernetes cluster. Useful when Hazeltest itself gets
      # deployed to Kubernetes because, while member access can be achieved with out-of-cluster mode in combination
      # with mounting the kubeconfig file and specifying its path accordingly, the in-cluster mode only relies on
      # native Kubernetes RBAC artifacts to delete Hazelcast member Pods.
      # (Note that for those artifacts to be rendered by the Helm chart, the 'features.useDeletePodsServiceAccount'
      # property in the chart's values.yaml file has to be set to 'true'.)
      # Hazeltest will attempt to retrieve its current namespace by means of the POD_NAMESPACE environment variable and
      # the '/var/run/secrets/kubernetes.io/serviceaccount/namespace' file mounted into the Pod by Kubernetes. If the
      # namespace cannot be retrieved by these means, the monkey will report an error and terminate.
      # Please note that the implication of this is that, regardless of how the namespace is retrieved, the Hazeltest
      # Pod has to reside in the same namespace as the Hazelcast member Pods to be acted upon, i.e. cross-namespace
      # actions on Hazelcast member Pods are not supported.
      k8sInCluster:
        # Same as for out-of-cluster config.
        labelSelector: app.kubernetes.io/name=hazelcastplatform
    # Configures the monkey's sleep behavior between runs.
    sleep:
      # Whether a sleep should be performed between runs.
      enabled: true
      # The duration, in seconds, to sleep. (Mind the difference between this sleep config's duration and the various
      # runners' sleeps, where the sleep is configured in milliseconds.)
      durationSeconds: 60
      # Enables or disables randomness for this sleep configuration. If enabled, the number of seconds to sleep will be
      # determined randomly in the closed interval [0, durationSeconds]. If disabled, the sleep will be static, i.e.
      # the monkey will simply sleep for the given number of seconds.
      enableRandomness: false
    # Configures deletion behavior. When enabled, the member killer monkey will kill the selected Hazelcast member
    # using a grace period as defined by 'durationSeconds' and 'enableRandomness'.
    memberGrace:
      # Whether members should be terminated gracefully at all.
      enabled: true
      # Number of seconds for the graceful shutdown, if member grace has been enabled. Randomness can be enabled for
      # this, however, see below.
      durationSeconds: 30
      # Activates or deactivates randomness for the 'durationSeconds' property. The rules are the same as for the
      # sleep configuration explained above.
      enableRandomness: true

# Caution: State cleaners will not modify data structures internal to Hazelcast itself. Such data structures
# start with a prefix of two underscores, and state cleaners will skip all such data structures even if
# two underscores have been explicitly provided using the prefix configuration below. This includes data structures
# that are not internal to Hazelcast itself, but that are still prefixed by two underscores.
# It is generally advisable not to assign names starting with two underscores to regular payload data structures.
stateCleaners:
  maps:
    # Whether this state cleaner should be enabled.
    enabled: true
    # What to do about an error that occurs upon attempt to clean any of the identified data structures.
    # Can be one of 'ignore' or 'fail', where the former is the default. Usually, when a state cleaner encounters
    # an error, the cause is that the state cleaner of another Hazeltest instance currently holds the lock in the
    # map cleaner sync map (which has to be the same for all Hazeltest instances) for the payload map in
    # question. This error can be safely ignored -- as soon as the lock is released, any instance checking
    # whether the payload data structure should be cleaned would reach a negative conclusion anyway because it has
    # just been cleaned (unless 'cleanAgainThreshold.enabled' was set to 'false', but that would be an unusual case
    # -- set the error behavior to 'fail' in this case so the Hazeltest instance fails fast and gets restarted).
    # It is for this reason that 'ignore', rather than 'fail', is the default error behavior.
    errorBehavior: ignore
    prefix:
      # Whether to enable prefix usage for identifying target maps.
      enabled: true
      # When prefix usage has been enabled, this prefix will be used to search for maps to clean in the entire set of
      # maps available on the target Hazelcast cluster (except those maps whose name starts with two underscores).
      prefix: "ht_"
    # Map cleaners across Hazeltest instances keep track of which payload maps they have cleaned at which timestamp.
    # This is the key to avoiding that a number of Hazeltest instances clean payload maps containing the state of
    # other instances in cases where payload map names are shared across these instances (which is the case for a
    # map runner when 'appendClientIdToMapName' was set to 'false' for this map runner). For example, when 100 Hazeltest
    # instances get created on a Kubernetes cluster in the form of Pods, Kubernetes won't create them all at precisely
    # the same time, but in small batches, whose creation happens milliseconds apart. Thus, without a means to check
    # when a payload data structure has been cleaned the last time, the later instances would clean payload maps that
    # are already in use by earlier instances, which would create havoc in these instances' map test loops. To prevent
    # this situation, a 'clean again threshold' can be configured, stating, for example, that any payload map identified
    # as a candidate for cleaning should only be cleaned if the given time, in milliseconds, has elapsed since the
    # map was cleaned the last time.
    cleanAgainThreshold:
      # Whether to apply the threshold specified by 'thresholdMs' to determine if a target payload map identified as
      # cleaning candidate is susceptible to cleaning.
      enabled: true
      # The threshold to apply to the cleaning decision. A candidate payload map will be cleaned if the difference
      # between the last cleaned timestamp and the current timestamp is greater than or equal to this number.
      thresholdMs: 30000
  # See 'stateCleaners.maps' for an explanation on these properties.
  queues:
    enabled: true
    errorBehavior: ignore
    prefix:
      enabled: true
      prefix: "ht_"
    cleanAgainThreshold:
      enabled: true
      thresholdMs: 30000

queueTests:
  # 'queueTests.tweets' configures the TweetRunner. The TweetRunner has access to a file containing 500 tweets on
  # Marvel's "Avengers: Endgame" movie. This file is a simplified and shortened version of the original tweet
  # collection, which you can find here: https://www.kaggle.com/datasets/kavita5/twitter-dataset-avengersendgame
  tweets:
    # The TweetRunner will not be run when this is set to 'false'.
    enabled: true
    # The number of goroutines the TweetRunner will spawn to work on queues. (Depending on the configuration of the
    # queue names using the 'append*' properties, this may or may not correspond to a higher number of queues the
    # runner will work on.)
    numQueues: 1
    # If set to 'true', the TweetRunner will append the goroutine index of each of the queues to the latter's names,
    # and consequently each of those goroutines will work on its own queue in Hazelcast. Conversely, if this is set to
    # 'false', all queue goroutines in this particular Hazeltest instance will work on the same queue (which
    # might even be the same as the queue in use by other Hazeltest instances depending on the value for the
    # 'appendClientIdToQueueName' setting).
    # In contrast to map runners such as the PokedexRunner, the TweetRunner cannot distinguish the elements written to
    # and read from a queue by different goroutines due to the nature of a queue (there is no unique key for a value;
    # instead, values are just put and polled). Therefore, different queue goroutines act on each other's elements, and
    # if queue names are identical across Hazeltest instances due to how the two 'append*' properties have been set,
    # the TweetRunners across those instances will act on each other's elements.
    appendQueueIndexToQueueName: true
    # If set to 'true', the TweetRunner will append the unique client ID of this Hazeltest instance to the names of the
    # queue or queues it spawns. This will make sure TweetRunners across Hazeltest instances all work on distinct
    # queues, thus avoiding those runners acting on each other's elements.
    appendClientIdToQueueName: false
    queuePrefix:
      enabled: true
      # This prefix will be put in front of the queue name as it is without introducing any additional special
      # characters.
      prefix: "ht_"
    # Configuration for the goroutine responsible for putting tweets into a Hazelcast queue. Each of the
    # 'numQueues' goroutines will spawn one goroutine for performing put operations.
    putConfig:
      # Enable or disable the put goroutine.
      enabled: true
      # The number of test loops the goroutine will perform for put operations. In each test loop, the goroutine will
      # put all tweets into a Hazelcast queue. Thus, specifying, for example, 10,000 runs means the goroutine will put
      # 10,000 * 500 tweets in total.
      # You may have noticed the semantics of specifying the number of test loops for this queue runner is different
      # compared to the map runners -- for the latter, one configures the number of test loops only once, and the test
      # loop operations (ingest, read, delete) are performed sequentially. For this queue runner, on the other hand,
      # the queue operations (put and poll) are decoupled in that they run in their own goroutines, so the ability to
      # configure the number of test loops individually for puts and polls directly translates to a higher degree of
      # versatility regarding the client behavior this queue runner can simulate.
      numRuns: 10000
      batchSize: 50
      sleeps:
        # Makes a put goroutine sleep once before performing the first put operation.
        initialDelay:
          enabled: false
          durationMs: 2000
          # Enables or disables randomness for this sleep configuration. If enabled and assuming the sleep itself is
          # enabled, the amount of milliseconds to sleep will be determined randomly in the closed interval
          # [0, durationMs]. If disabled, on the other hand, the sleep will be static, i.e. the goroutine for this
          # runner will simply sleep for the given amount of milliseconds.
          # Hint: For queue runners, it makes sense to disable randomness for the initial sleep, as a reduced amount of
          # milliseconds slept due to randomness might invalidate the purpose of the initial sleep, which is to give
          # the operation in question, normally the put operation, enough time to put some elements into its queues
          # before the poll operations start.
          enableRandomness: false
        # Causes a put goroutine to sleep after performing one batch of put operations, where the batch size is
        # configured by means of the above '(...).putConfig.batchSize' property.
        betweenActionBatches:
          enabled: true
          durationMs: 1000
          enableRandomness: true
        # Makes a put goroutine pause execution before each run.
        betweenRuns:
          enabled: true
          durationMs: 2000
          enableRandomness: true
    # Configuration for the goroutine running poll operations for tweets against the Hazelcast queue. Just like with
    # the put operations, each of the 'numQueues' goroutines will spawn a dedicated goroutine for polling operations,
    # too.
    pollConfig:
      # Enable or disable the polling goroutine.
      enabled: true
      numRuns: 10000
      batchSize: 50
      # Same as for putConfig
      sleeps:
        initialDelay:
          enabled: true
          durationMs: 12500
          enableRandomness: false
        betweenActionBatches:
          enabled: true
          durationMs: 1000
          enableRandomness: true
        betweenRuns:
          enabled: true
          durationMs: 2000
          enableRandomness: true
  load:
    enabled: true
    numQueues: 5
    # Controls how many entries the queue load runner will use (this property is not available on the tweet runner
    # because the tweet runner works on a static data set, hence the number of elements as well as their size are
    # given). The queue load runner will create 'numLoadEntries' string entries to iterate over them for each of the
    # queues in each of the runs of its put and poll configurations.
    numLoadEntries: 5000
    # Configures the size, in bytes, for the random string to be used as each load entry (to reduce Hazeltest's memory
    # footprint, only one random string of size 'payloadSizeBytes' is created, and it will serve as the payload for
    # each of the entries)
    payloadSizeBytes: 5000
    appendQueueIndexToQueueName: true
    appendClientIdToQueueName: false
    queuePrefix:
      enabled: true
      prefix: "ht_"
    putConfig:
      enabled: true
      numRuns: 10000
      batchSize: 50
      sleeps:
        initialDelay:
          enabled: false
          durationMs: 2000
          enableRandomness: false
        betweenActionBatches:
          enabled: true
          durationMs: 200
          enableRandomness: true
        betweenRuns:
          enabled: true
          durationMs: 200
          enableRandomness: true
    pollConfig:
      enabled: true
      numRuns: 10000
      batchSize: 50
      sleeps:
        initialDelay:
          enabled: true
          durationMs: 20000
          enableRandomness: false
        betweenActionBatches:
          enabled: true
          durationMs: 200
          enableRandomness: true
        betweenRuns:
          enabled: true
          durationMs: 200
          enableRandomness: true

mapTests:
  pokedex:
    # If set to 'false', the PokedexRunner will not be executed
    enabled: true
    # The runner will spawn one goroutine for each map
    numMaps: 10
    # If set to 'true', each of the goroutines will use its own map name, thus effectively accessing its own map
    # In other words, if this is set to 'true', this PokedexRunner will use distinct maps in Hazelcast; if it is set
    # to 'false', the goroutines will access the same map
    # For the PokedexRunner, this will result in a higher number of maps, each containing a smaller number of keys
    # Note: Both the client ID and the currently active goroutine/number are still part of the map keys, so no matter
    # how the following two properties are set, different clients and goroutines within clients will not act on each
    # other's keys
    appendMapIndexToMapName: true
    # If set to 'true', the PokedexRunner will append the ID of its Hazeltest instance to the map name such that it
    # gets its own map in Hazelcast; if set to 'false', then this PokedexRunner will share its maps with the maps
    # created by PokedexRunners in other Hazeltest instances
    # Set this to 'false' if you would like to make all Hazeltest instances access the same map or maps
    appendClientIdToMapName: false
    # The number of test loops (e.g., ingest-read-delete) to execute in each map goroutine
    numRuns: 10000
    performPreRunClean:
      # Whether to clean all maps for this runner prior to the runner's test loop launching. For example, if 'numMaps'
      # is 10, this property will ensure the maps the runner's test loop will act upon are cleaned of all entries
      # before the test loop starts.
      enabled: false
      # What to do when cleaning a target map fails. Can be either 'ignore' or 'fail'. Default is 'ignore'.
      # 'ignore': Treats the error as a warning. The warning will be logged, but the map runner will
      # nonetheless start with its test loop.
      # 'fail': The error will be treated as irrecoverable failure, meaning the runner won't start its test loop, and
      # instead terminate immediately.
      errorBehavior: ignore
      cleanAgainThreshold:
        # Whether to apply the threshold given below to determine whether a target map is already susceptible to
        # getting cleaned.
        enabled: true
        # If threshold usage is enabled, pre-run cleaning will only clean a target map if the given duration of
        # milliseconds has elapsed since the last time a cleaning operation was performed on this map.
        thresholdMs: 30000
    mapPrefix:
      enabled: true
      # The prefix will be put in front of the map name as-is, so no additional underscores or other
      # characters will be added
      prefix: "ht_"
    # Using sleeps, the pace with which this runner interacts with Hazelcast can be slowed down
    sleeps:
      # Can be enabled to make the test loop sleep for the given duration after each of the 'numRuns' runs. One run
      # consists of three actions (e.g. ingest-read-delete), so use this setting to tell the test loop to sleep after
      # having finished the last action of run n and before commencing with the first action of run n+1
      betweenRuns:
        enabled: true
        durationMs: 2000
        enableRandomness: true
    testLoop:
      # Can take the value of one of the sub-objects below, i.e. either 'batch' or 'boundary'
      type: boundary
      # The batch test loop offers comparatively simple logic to perform map actions in Hazelcast -- it inserts all
      # elements of the source data, then reads all items thus ingested, removes some of them, and then starts all over.
      # So, it works in batches of operations -- hence its name.
      # The batch test loop is sufficient to maximize both the CPU and memory load on a Hazelcast cluster --
      # specifically in combination with the LoadRunner, not so much the PokedexRunner --, but it does not offer enough
      # fine-tuning options for the generated load to be very realistic (i.e. the load patterns generated on the target
      # Hazelcast cluster probably won't resemble load a typical client application might induce).
      batch:
        sleeps:
          # If this is enabled, the test loop will wait for 'durationMs' milliseconds (or, if randomness is enabled,
          # for a random duration starting at 1 and bounded by 'durationMs') between batches of actions.
          # NOTE(review): the original wording of this comment was truncated, and the LoadRunner's batch test loop
          # additionally configures an 'afterBatchAction' sleep -- confirm whether that key is intentionally absent
          # here.
          betweenActionBatches:
            enabled: true
            durationMs: 5000
            enableRandomness: true
      # The boundary test loop was introduced to offer more sophisticated load generation logic aiming at inducing
      # load realistic enough to resemble the load patterns typical client applications might generate in
      # the day-to-day operation of a production environment. Thus, its goal is to offer the means for simulating
      # typical production traffic such that the Hazelcast cluster under test can be exposed to production-like
      # traffic in order to deliver proof that the cluster's configuration, bundled in whatever sort of release
      # candidate you use (e.g. a Helm chart), is actually fit for production, and to deliver this proof BEFORE the
      # configuration ever saw the light of a production environment.
      # The boundary test loop is so called because it revolves around "fill boundaries" as defined by the number
      # of items in the runner's data source in combination with an upper and a lower fill percentage (i.e., an
      # upper and a lower boundary the runner should respect).
      # For example, given the 151 source elements of the PokedexRunner and assuming an upper fill percentage
      # of 0.8 (80 %) as well as a lower fill percentage of 0.2 (20 %), the runner will execute insert and remove
      # operations so the number of elements present in the target map on the Hazelcast cluster under test will always
      # remain within the boundaries thus defined (here: 151*0.8=121 elements (rounded) as the upper boundary, and
      # 151*0.2=30 (rounded) as the lower). After each state-changing operation (insert or remove), the runner
      # will perform a read operation.
      # Each of the runner's goroutines will only consider its own key-value pairs, so even if the number of
      # maps (which internally translates to the number of goroutines) is set to something greater than 1 and the
      # runner is instructed to not append the map number to the map name (such that all map goroutines write to the
      # same map), each of the runner's boundary test loop goroutines will still only work on
      # its own key-value pairs.
      # Iterating up and down between the map fill boundaries is carried out in scope of a so-called operation chain.
      # An operation chain represents the idea that the test loop should move over its source data multiple times in
      # order to have enough possibility to hit both the upper and the lower boundary a couple of times, and perform
      # enough operations of all kinds (insert, read, remove), in between for the load thus generated to be realistic.
      # Take, for example, the PokedexRunner's data source, which is the 151 Pokémon of the first-generation Pokédex.
      # With a chain length of 1510, the test loop will iterate between the upper and the lower boundary precisely ten
      # times assuming an upper boundary of 100 % and a lower boundary of 0 %, and the number of iterations increases
      # when the "boundary window" is configured to be more narrow. For this reason, the chain length should typically
      # be higher than the number of elements in the source data.
      # In order to introduce more randomness to the order of operations, the boundary test loop can be configured
      # with a probability to execute an operation towards the boundary (an "action towards boundary probability").
      # For example, if the test loop has determined it should fill the target map and the action towards boundary
      # probability is configured to be 0.7 (70 %), then it will perform an insert operation with a probability of
      # only 70 %. Hence, even in "fill mode", the action to be performed could be a remove (unless this would
      # cause the lower map fill boundary to be violated, in which case the runner will override the probability and
      # force an insert).
      boundary:
        sleeps:
          # Whether the runner's boundary test loop should sleep after having executed an operation chain.
          betweenOperationChains:
            enabled: true
            durationMs: 5000
            enableRandomness: true
          # Whether the runner's boundary test loop should sleep after having performed one operation within the
          # operation chain, e.g. an insert.
          afterChainAction:
            enabled: true
            durationMs: 500
            enableRandomness: true
          # Whether the runner's boundary test loop should sleep whenever it changed its map fill mode. For example, if
          # the previous map fill mode was 'fill', the upper fill boundary threshold was reached and the test loop
          # now switches to 'drain', this sleep will tell it to wait before moving on.
          # This sleeping behavior is useful for letting metrics collection systems (such as Prometheus or the
          # Hazelcast Management Center itself) catch up and read the current map metrics before the next actions
          # of the test loop begin altering them again. For example, by configuring a long-enough sleep, you can make
          # sure the map fill states at the point of mode change are correctly recorded by the metrics system. (For
          # this reason, randomness is disabled on this sleep by default, but you can enable it if the precision of
          # metrics collection is not important in your use case.)
          uponModeChange:
            enabled: true
            durationMs: 6000
            enableRandomness: false
        # The definition of the test loop's operation chain.
        operationChain:
          # Defines how often the test loop will "bounce" between the upper and the lower map fill boundary.
          # The higher this value, the more often both boundaries will be hit (though the "distance" between the
          # boundaries also influences how many boundary hits will occur in one chain).
          # This value should be greater than the number of elements in the runner's data source.
          length: 1000
          # The test loop keeps track of the state of the key-value pairs in the target Hazelcast map as well
          # as of the actions it has performed (keeping track of this state is necessary to decide which map
          # action should be performed next, e.g. to avoid violating a map fill boundary). This property controls
          # whether both the local and the remote state should be cleared after having run through one operation chain.
          # (Here, clearing the remote state means the test loop will clear all values in the target map it is
          # responsible for, leaving the values of other test loops of the same runner untouched.)
          # If this is set to "false", then the transition from one operation chain to the next will be seamless,
          # i.e. from looking at the number of elements in the target Hazelcast map, you couldn't tell whether the
          # runner's test loop has begun a new operation chain, as it will neither reset its local nor the remote state
          # in the target Hazelcast map.
          resetAfterChain: true
          # The boundary definition this boundary test loop will use to determine the degree to which it should fill
          # the target Hazelcast map.
          # (You can configure all map goroutines (and hence each test loop) to work on the same Hazelcast map by
          # setting the "appendMapIndexToMapName" property to "false", but since each test loop will only consider
          # its own values, it will still respect those boundaries in relation to the number of items under its
          # responsibility.)
          boundaryDefinition:
            # The upper map fill boundary.
            # Example: Setting this to 0.8 (80 %) on the PokedexRunner's 151 data source elements will result in this
            # test loop not ingesting more than 121 elements.
            upper:
              # The map fill percentage that defines the upper map boundary in relation to the number
              # of elements in the runner's data source.
              mapFillPercentage: 0.8
              # Whether the map fill percentage should be determined randomly before the start of each new
              # operation chain. If set to "true", the random value will be taken from the half-open interval
              # (mapFillPercentage, 1.0].
              enableRandomness: true
            lower:
              # The map fill percentage that defines the lower map boundary in relation to the number
              # of elements in the runner's data source.
              mapFillPercentage: 0.2
              # If set to "true", the random value will be taken from the half-open interval
              # [0.0, mapFillPercentage).
              enableRandomness: true
            # The probability with which to execute an action towards the boundary.
            # If the test loop has determined it should currently fill the map, then this setting controls
            # the probability for an insert operation to be performed (here, the insert would be the action
            # towards the boundary because in "fill mode" the test loop aims for the upper boundary). The exception
            # to the rule is when the test loop determines it has reached a boundary, in which case it will enforce
            # an action to move away from it (e.g. a remove operation in case the upper map fill boundary has been
            # reached to not violate this boundary), thus overriding the probability setting.
            actionTowardsBoundaryProbability: 1
  load:
    enabled: true
    numMaps: 10
    # In contrast to the PokedexRunner, whose data set is limited by the number of Pokémon contained in the
    # first-generation Pokédex (151), the LoadRunner can create arbitrarily many entries, and the 'numEntriesPerMap'
    # property controls how many entries it will create
    numEntriesPerMap: 50000
    # Whereas the PokedexRunner uses a pre-defined data set and hence cannot offer the option for adjusting the payload
    # size of the keys it writes, the LoadRunner generates a string to be used as the payload, and the properties
    # below can be used to adjust the manner and size in which this string is generated.
    payload:
      fixedSize:
        # Whether to use a fixed-size payload. Enabling this will make the LoadRunner generate one string with a
        # size of 'sizeBytes' and pre-populate all load elements to be used in scope of the LoadRunner's
        # test loop. Hence, each key-value pair thus written will use the same string payload. Enabling fixed-size
        # payloads is equivalent to the LoadRunner's functionality in all versions of Hazeltest prior to 0.15.0.
        enabled: false
        sizeBytes: 10000000
      variableSize:
        # Whether to use variable-size payloads for the LoadRunner's test loop. If enabled, the properties below
        # are used to configure generating a string-based payload. The size of the generated payload will be within
        # the closed interval of [lower boundary, upper boundary], as specified by the two corresponding properties
        # below.
        enabled: true
        # The lower boundary (inclusive) of the size, in bytes, to use for payload generation.
        lowerBoundaryBytes: 1000
        # The upper boundary (also inclusive) of the size, in bytes, to use for payload generation.
        upperBoundaryBytes: 10000
        # The steps after which the size of the payload should be re-evaluated. If set to 100, for example, an
        # evaluated size within the [lower, upper] boundary will be used for 100 write operations (emphasis on "write"
        # operations; a read or delete operation will not increase the counter). Set this to 1 if you would like the
        # LoadRunner's test loop to use a random size within the [lower, upper] boundary for each write operation.
        evaluateNewSizeAfterNumWriteActions: 100
    appendMapIndexToMapName: true
    appendClientIdToMapName: false
    numRuns: 10000
    performPreRunClean:
      enabled: false
      errorBehavior: ignore
      cleanAgainThreshold:
        enabled: true
        thresholdMs: 30000
    mapPrefix:
      enabled: true
      prefix: "ht_"
    sleeps:
      betweenRuns:
        enabled: true
        durationMs: 2000
        enableRandomness: true
    testLoop:
      type: boundary
      batch:
        sleeps:
          afterBatchAction:
            enabled: true
            durationMs: 10
            enableRandomness: true
          betweenActionBatches:
            enabled: true
            durationMs: 2000
            enableRandomness: true
      boundary:
        sleeps:
          betweenOperationChains:
            enabled: true
            durationMs: 1000
            enableRandomness: true
          afterChainAction:
            enabled: true
            durationMs: 50
            enableRandomness: false
          uponModeChange:
            enabled: true
            durationMs: 15000
            enableRandomness: false
        operationChain:
          length: 2000
          resetAfterChain: true
          boundaryDefinition:
            upper:
              mapFillPercentage: 0.9
              enableRandomness: true
            lower:
              mapFillPercentage: 0.2
              enableRandomness: true
            actionTowardsBoundaryProbability: 0.9