# Grafana Dashboard YAML format
# This YAML config file has the format shown below.
#
# Explanation of entries:
# =======================
#   - section:  Section heading
#   - row:      Indicates a new row (future will allow multiples per row)
#   - title:    Title of the panel
#   - expr:     The PromQL query - note that variables are prefixed by '$'
#               In the case of the flag --no-sdp, then this will be removed: 'sdpinst="$sdpinst",'
#                 - expr: p4_monitor_by_cmd{sdpinst="$sdpinst",serverid="$serverid"}
#   - legend:   Ensures the legend shows relevant fields from metrics labels
#               For more information on options - see 'Labels' column in:
#               https://github.com/perforce/p4prometheus#metrics-available
#                 legend: "{{cmd}}"
# Example:
#   - section: Monitor Tracking
#   - row: 1
#   - title: Monitor Processes (by cmd)
#     target:
#       - expr: p4_monitor_by_cmd{sdpinst="$sdpinst",serverid="$serverid"}
#         legend: "{{cmd}}"
#
# Extra fields:
#   type: gauge     # Note default is counter
#   yformat: s      # good for setting time values (seconds/hours/weeks/...)
#                   # decbytes - byte values
#                   # percent - 1-100%
# ========================================================================
# Expected customisations:
# * Check the hardcoded values for replication in 'expr' and adjust for your environment:
#   Anything with the value '(_UPDATE_FOR_YOUR_SITE_)'
#   E.g.
#
#   - title: Replica Lag for SAMPLE_REPLICA from SAMPLE_MASTER (_UPDATE_FOR_YOUR_SITE_)
#     target:
#       - expr: >-
#           p4_replica_curr_pos{instance="SAMPLE_MASTER_instance",sdpinst="1",servername="SAMPLE_MASTER"} -
#           ignoring (serverid, servername)
#           p4_replica_curr_pos{instance="SAMPLE_MASTER_instance",sdpinst="1",servername="SAMPLE_REPLICA"}
# ========================================================================
# The following counters not included (for now)
#   p4_cmd_ip_counter
#   p4_cmd_ip_cumulative_seconds
#   p4_cmd_running
#   p4_cmd_user_counter
#   p4_cmd_user_cumulative_seconds
#   p4_cmd_user_detail_counter
#   p4_cmd_user_detail_cumulative_seconds
#
# NOTE(review): this file was reconstructed from a copy whose line breaks and
# indentation had been lost. 'yformat' is written at panel level (next to
# 'type') wherever the flattened text allowed it - confirm against the
# dashboard generator that this is the level it reads.

- section: Disk Space (_UPDATE_FOR_YOUR_SITE_ - check mountpoints)

- row: 1

# Free bytes on each mount minus the configured minimum for the matching
# filesys value; the last target plots the configured minimums themselves.
- title: Free Space Alert - filesys.*.min
  target:
    - expr: >-
        node_filesystem_free_bytes{mountpoint="/hxmetadata"}
        - on (customer, instance)
        p4_filesys_min{filesys="P4ROOT"}
      legend: "hxmetadata above min"
    - expr: >-
        node_filesystem_free_bytes{mountpoint="/hxlogs"}
        - on (customer, instance)
        p4_filesys_min{filesys="P4LOG"}
      legend: "hxlogs above min"
    - expr: >-
        node_filesystem_free_bytes{mountpoint="/hxdepots"}
        - on (customer, instance)
        p4_filesys_min{filesys="depot"}
      legend: "hxdepots above min"
    # Prometheus regex matchers are fully anchored, so the alternation must
    # list the exact label values: "depot" is the value selected by the
    # hxdepots target above ("depots" would never match it).
    - expr: p4_filesys_min{customer="$customer", filesys=~"depot|P4ROOT|P4LOG"}
      legend: "min {{filesys}}"
  yformat: decbytes

- title: Filesystem percentage used (_UPDATE_FOR_YOUR_SITE_ - check mountpoints)
  target:
    - expr: >-
        100.0 - 100 *(node_filesystem_avail_bytes{mountpoint=~"/hx.*"}) /
        (node_filesystem_size_bytes{mountpoint=~"/hx.*"})
      legend: "{{mountpoint}}"
  yformat: percent

- section: Monitor Tracking

- row: 1

- title: Monitor Processes (by cmd)
  target:
    - expr: p4_monitor_by_cmd{sdpinst="$sdpinst",serverid="$serverid"}
      legend: "{{cmd}}"
    - expr: sum(p4_monitor_by_cmd{sdpinst="$sdpinst",serverid="$serverid"})
      legend: all

- title: Monitor Processes (by user)
  target:
    - expr: p4_monitor_by_user{sdpinst="$sdpinst",serverid="$serverid"}
      legend: "{{user}}"
    - expr: sum(p4_monitor_by_user{sdpinst="$sdpinst",serverid="$serverid"})
      legend: all

- row: 1

- title: p4d process count
  target:
    - expr: p4_process_count

- title: rtv sessions active
  target:
    - expr: p4_rtv_svr_sessions_active

- row: 1

- title: $serverid Time for last SDP checkpoint
  target:
    - expr: p4_sdp_checkpoint_duration{sdpinst="$sdpinst",serverid="$serverid"}
  yformat: s

- title: Uptime
  type: gauge
  target:
    - expr: p4_server_uptime{sdpinst="$sdpinst"}
  yformat: s

- title: $serverid time since last SDP checkpoint
  target:
    - expr: time() - p4_sdp_checkpoint_log_time{sdpinst="$sdpinst",serverid="$serverid"}
  yformat: s

- row: 1

- title: All P4 Cmds Count (rate/10min)
  target:
    - expr: rate(p4_completed_cmds_per_day{sdpinst="$sdpinst",serverid="$serverid"}[10m])
    - expr: sum(rate(p4_cmd_counter{sdpinst="$sdpinst",serverid="$serverid"}[10m]))

- title: p4d log lines read (rate/min)
  target:
    - expr: rate(p4_prom_log_lines_read{sdpinst="$sdpinst",serverid="$serverid"}[1m])

- row: 1

- title: p4prometheus cmds pending
  target:
    - expr: p4_prom_cmds_pending{sdpinst="$sdpinst",serverid="$serverid"}

- title: p4prometheus cmds processed (rate/min)
  target:
    - expr: rate(p4_prom_cmds_processed{sdpinst="$sdpinst",serverid="$serverid"}[1m])

- row: 1

- title: p4prometheus CPU system (rate/min)
  target:
    - expr: rate(p4_prom_cpu_system{sdpinst="$sdpinst",serverid="$serverid"}[1m])

- title: p4prometheus CPU user (rate/min)
  target:
    - expr: rate(p4_prom_cpu_user{sdpinst="$sdpinst",serverid="$serverid"}[1m])

- row: 1

- title: Error Count rates by subsystem/id
  target:
    - expr: rate(p4_error_count{sdpinst="$sdpinst",serverid="$serverid",subsystem!~"[0-9].*"}[1m])
      # Quoted like the other templated legends; the value is unchanged.
      legend: "subsys {{subsystem}}, level {{level}}, id {{error_id}}"

- row: 1

- title: P4 cmds by program - count (rate/10min)
  target:
    - expr: rate(p4_cmd_program_counter{sdpinst="$sdpinst",serverid="$serverid"}[10m])
      legend: "{{program}}"

- title: P4 cmds by program - duration (rate/10min)
  target:
    - expr: rate(p4_cmd_program_cumulative_seconds{sdpinst="$sdpinst",serverid="$serverid"}[10m])
      legend: "{{program}}"

- section: Replication

- row: 1

- title: Replica Journal number (from servers -J on master)
  target:
    - expr: p4_replica_curr_jnl{sdpinst="$sdpinst",serverid="$serverid"}
      legend: "{{servername}}"

- title: Replica Journal Pos (from servers -J on master)
  target:
    - expr: p4_replica_curr_pos{sdpinst="$sdpinst",serverid="$serverid"}
      legend: "{{servername}}"

# This is replaced by the metrics below - because this requires subtracting with named instances
# - row: 1
# - title: Replica Lag for SAMPLE_REPLICA from SAMPLE_MASTER (_UPDATE_FOR_YOUR_SITE_)
#   target:
#     - expr: >-
#         p4_replica_curr_pos{sdpinst="$sdpinst",instance="SOME_MASTER_INSTANCE",servername="SAMPLE_MASTER"} -
#         ignoring (serverid, servername)
#         p4_replica_curr_pos{sdpinst="$sdpinst",instance="SOME_REPLICA_INSTANCE",servername="SAMPLE_REPLICA"}

- row: 1

- title: Pull replica journals behind
  target:
    - expr: p4_pull_replica_journals_behind{sdpinst="$sdpinst"}

- title: Replica lag (bytes)
  target:
    - expr: p4_pull_replica_lag{sdpinst="$sdpinst"}
  yformat: decbytes

- row: 1

- title: Pull queue size on replica
  target:
    - expr: p4_pull_queue{sdpinst="$sdpinst"}

- title: rtv_repl_behind_bytes
  target:
    # Filter on the sdpinst label, as the neighbouring replication panels and
    # the "RTV replica byte lag" panel for this same metric do; the selector
    # previously compared the 'instance' label against $sdpinst.
    - expr: p4_rtv_rpl_behind_bytes{sdpinst="$sdpinst"}

- row: 1

- title: Pull queue errors
  target:
    - expr: p4_pull_errors{sdpinst="$sdpinst",serverid="$serverid"}

- title: Replication errors
  target:
    - expr: p4_pull_replication_error{sdpinst="$sdpinst",serverid="$serverid"}

- row: 1

- title: Cmd counts per replica IP rate/min
  target:
    - expr: rate(p4_cmd_replica_counter{sdpinst="$sdpinst",serverid="$serverid"}[1m])
      legend: "{{replica}}"

- title: Cmd counts per replica IP duration/min
  target:
    - expr: rate(p4_cmd_replica_cumulative_seconds{sdpinst="$sdpinst",serverid="$serverid"}[1m])
      legend: "{{replica}}"

- row: 1

- title: Files synced (add/update/delete) per min
  target:
    - expr: rate(p4_sync_files_added{sdpinst="$sdpinst",serverid="$serverid"}[1m])
      legend: "added"
    - expr: rate(p4_sync_files_updated{sdpinst="$sdpinst",serverid="$serverid"}[1m])
      legend: "updated"
    - expr: rate(p4_sync_files_deleted{sdpinst="$sdpinst",serverid="$serverid"}[1m])
      legend: "deleted"

- title: Bytes synced (add/update) per min
  target:
    - expr: rate(p4_sync_bytes_added{sdpinst="$sdpinst",serverid="$serverid"}[1m])
      legend: "added"
      # NOTE(review): this yformat sat between the two targets in the source,
      # so it can only belong to the first target; kept as found. Confirm
      # whether the generator reads a target-level yformat at all.
      yformat: decbytes
    - expr: rate(p4_sync_bytes_updated{sdpinst="$sdpinst",serverid="$serverid"}[1m])
      legend: "updated"
  yformat: decbytes

- section: Cmd Count and Duration

- row: 1

- title: Cmds duration (rate/min)
  target:
    - expr: rate(p4_cmd_cumulative_seconds{sdpinst="$sdpinst",serverid="$serverid"}[1m])
      legend: "{{cmd}}"

- title: p4 cmds top 10 (rate/min)
  target:
    - expr: sum without (instance, job)(rate(p4_cmd_counter{sdpinst="$sdpinst",serverid="$serverid"}[1m]))
      legend: "{{cmd}}"

- row: 1

- title: Cmds CPU System (rate/min)
  target:
    - expr: rate(p4_cmd_cpu_system_cumulative_seconds{sdpinst="$sdpinst",serverid="$serverid"}[1m])
      legend: "{{cmd}}"

- title: Cmds CPU User (rate/min)
  target:
    - expr: rate(p4_cmd_cpu_user_cumulative_seconds{sdpinst="$sdpinst",serverid="$serverid"}[1m])
      legend: "{{cmd}}"

- row: 1

- title: Cmd Error Counter (rate/min)
  target:
    - expr: rate(p4_cmd_error_counter{sdpinst="$sdpinst",serverid="$serverid"}[1m])
      legend: "{{cmd}}"

- title: Trigger duration (rate/min)
  target:
    - expr: rate(p4_total_trigger_lapse_seconds{sdpinst="$sdpinst",serverid="$serverid"}[1m])
      legend: "{{trigger}}"

- section: Table Locking

- row: 1

- title: P4 Read Locks
  target:
    - expr: p4_locks_db_read{sdpinst="$sdpinst",serverid="$serverid"}
      legend: db
    - expr: p4_locks_cliententity_read{sdpinst="$sdpinst",serverid="$serverid"}
      legend: cliententity
    - expr: p4_locks_meta_read{sdpinst="$sdpinst",serverid="$serverid"}
      legend: meta

- title: P4 Write Locks
  target:
    - expr: p4_locks_db_write{sdpinst="$sdpinst",serverid="$serverid"}
      legend: db
    - expr: p4_locks_cliententity_write{sdpinst="$sdpinst",serverid="$serverid"}
      legend: cliententity
    - expr: p4_locks_meta_write{sdpinst="$sdpinst",serverid="$serverid"}
      legend: meta

- row: 1

- title: p4 read locks held per table (rate/min)
  target:
    - expr: sum without (instance, job) (rate(p4_total_read_held_seconds{sdpinst="$sdpinst",serverid="$serverid"}[1m]))
      legend: "{{table}}"

- title: p4 read locks waiting (rate/min)
  target:
    - expr: sum without (instance, job) (rate(p4_total_read_wait_seconds{sdpinst="$sdpinst",serverid="$serverid"}[1m]))
      legend: "{{table}}"

- row: 1

- title: p4 write locks held per table (rate/min)
  target:
    - expr: sum without (instance, job) (rate(p4_total_write_held_seconds{sdpinst="$sdpinst",serverid="$serverid"}[1m]))
      legend: "{{table}}"

- title: p4 write locks wait per table (rate/min)
  target:
    - expr: sum without (instance, job) (rate(p4_total_write_wait_seconds{sdpinst="$sdpinst",serverid="$serverid"}[1m]))
      legend: "{{table}}"

- row: 1

- title: RTV processes waiting for locks
  target:
    - expr: p4_rtv_db_lockwait{sdpinst="$sdpinst",serverid="$serverid"}

- title: RTV DB I/O record count
  target:
    - expr: p4_rtv_db_io_records{sdpinst="$sdpinst",serverid="$serverid"}

- row: 1

- title: RTV is checkpoint active
  target:
    - expr: p4_rtv_db_ckp_active{sdpinst="$sdpinst",serverid="$serverid"}

- title: RTV checkpoint records processed
  target:
    - expr: p4_rtv_db_ckp_records{sdpinst="$sdpinst",serverid="$serverid"}

- row: 1

- title: RTV replica byte lag
  target:
    - expr: p4_rtv_rpl_behind_bytes{sdpinst="$sdpinst",serverid="$serverid"}

- title: RTV replica journal lag
  target:
    - expr: p4_rtv_rpl_behind_journals{sdpinst="$sdpinst",serverid="$serverid"}

- row: 1

- title: RTV active sessions
  target:
    - expr: p4_rtv_svr_sessions_active{sdpinst="$sdpinst",serverid="$serverid"}

- title: RTV total sessions
  target:
    - expr: p4_rtv_svr_sessions_total{sdpinst="$sdpinst",serverid="$serverid"}

- row: 1

- title: Processes waiting on read locks
  target:
    - expr: p4_locks_db_read{sdpinst="$sdpinst",serverid="$serverid"}

- title: Processes waiting on write locks
  target:
    - expr: p4_locks_db_write{sdpinst="$sdpinst",serverid="$serverid"}

- row: 1

- title: Processes waiting on cliententity read locks
  target:
    - expr: p4_locks_cliententity_read{sdpinst="$sdpinst",serverid="$serverid"}

- title: Processes waiting on cliententity write locks
  target:
    - expr: p4_locks_cliententity_write{sdpinst="$sdpinst",serverid="$serverid"}

- row: 1

- title: Processes waiting on meta_read locks
  target:
    - expr: p4_locks_meta_read{sdpinst="$sdpinst",serverid="$serverid"}

- title: Processes waiting on meta_write
  target:
    - expr: p4_locks_meta_write{sdpinst="$sdpinst",serverid="$serverid"}

- row: 1

- title: Processes blocked count
  target:
    - expr: p4_locks_cmds_blocked{sdpinst="$sdpinst",serverid="$serverid"}