#!/bin/bash # 已在CentOS 6.x上进行测试 # author: digoal # 2015-10 # 权限需求 , OS: root PG: Superuser # 用法 . ./generate_report.sh >/tmp/report.log 2>&1 # 生成报告目录 grep -E "^----->>>|^\|" /tmp/report.log | sed 's/^----->>>---->>>/ /' | sed '1 i\ \ 目录\n\n' | sed '$ a\ \n\n\ \ 正文\n\n' # 请将以下变量修改为与当前环境一致, 并且确保使用这个配置连接任何数据库都不需要输入密码 export PGHOST=127.0.0.1 export PGPORT=1921 export PGDATABASE=postgres export PGUSER=postgres export PGPASSWORD=postgres export PGDATA=/data01/pg_root_1921 export PGHOME=/opt/pgsql export PATH=$PGHOME/bin:$PATH:. export DATE=`date +"%Y%m%d%H%M"` export LD_LIBRARY_PATH=$PGHOME/lib:/lib64:/usr/lib64:/usr/local/lib64:/lib:/usr/lib:/usr/local/lib:$LD_LIBRARY_PATH # 记住当前目录 PWD=`pwd` # 获取postgresql日志目录 pg_log_dir=`grep '^\ *[a-z]' $PGDATA/postgresql.conf|awk -F "#" '{print $1}'|grep log_directory|awk -F "=" '{print $2}'` # 检查是否standby is_standby=`psql --pset=pager=off -q -A -t -c 'select pg_is_in_recovery()'` echo " ----- PostgreSQL 巡检报告 ----- " echo " ===== $DATE ===== " if [ $is_standby == 't' ]; then echo " ===== 这是standby节点 ===== " else echo " ===== 这是primary节点 ===== " fi echo "" echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|" echo "| 操作系统信息 |" echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|" echo "" echo "----->>>---->>> 主机名: " hostname -s echo "" echo "----->>>---->>> 以太链路信息: " ip link show echo "" echo "----->>>---->>> IP地址信息: " ip addr show echo "" echo "----->>>---->>> 路由信息: " ip route show echo "" echo "----->>>---->>> 操作系统内核: " uname -a echo "" echo "----->>>---->>> 内存(MB): " free -m echo "" echo "----->>>---->>> CPU: " lscpu echo "" echo "----->>>---->>> 块设备: " lsblk echo "" echo "----->>>---->>> 拓扑: " lstopo-no-graphics echo "" echo "----->>>---->>> 进程树: " pstree -a -A -c -l -n -p -u -U -Z echo "" echo "----->>>---->>> 操作系统配置文件 静态配置信息: " echo "----->>>---->>> /etc/sysctl.conf " grep "^[a-z]" /etc/sysctl.conf echo "" echo "----->>>---->>> /etc/security/limits.conf " grep -v "^#" /etc/security/limits.conf|grep -v "^$" echo "" echo "----->>>---->>> /etc/security/limits.d/*.conf " for dir in `ls /etc/security/limits.d`; do echo "/etc/security/limits.d/$dir : "; grep -v "^#" /etc/security/limits.d/$dir|grep -v "^$"; done echo "" echo "----->>>---->>> /etc/sysconfig/iptables " cat /etc/sysconfig/iptables echo "" echo "----->>>---->>> /etc/fstab " cat /etc/fstab echo "" echo "----->>>---->>> /etc/rc.local " cat /etc/rc.local echo "" echo "----->>>---->>> /etc/selinux/config " cat /etc/selinux/config echo "" echo "----->>>---->>> /boot/grub/grub.conf " cat /boot/grub/grub.conf echo "" echo "----->>>---->>> /var/spool/cron 用户cron配置 " for dir in `ls /var/spool/cron`; do echo "/var/spool/cron/$dir : "; cat /var/spool/cron/$dir; done echo "" echo "----->>>---->>> chkconfig --list " chkconfig --list echo "" echo "----->>>---->>> iptables -L -v -n -t filter 动态配置信息: " iptables -L -v -n -t filter echo "" echo "----->>>---->>> iptables -L -v -n -t nat 动态配置信息: " iptables -L -v -n -t nat echo "" echo "----->>>---->>> iptables -L -v -n -t mangle 动态配置信息: " iptables -L -v -n -t mangle echo "" echo "----->>>---->>> iptables -L -v -n -t raw 动态配置信息: " iptables -L -v -n -t raw echo "" echo "----->>>---->>> sysctl -a 动态配置信息: " sysctl -a echo "" echo "----->>>---->>> mount 动态配置信息: " mount -l echo "" echo "----->>>---->>> selinux 动态配置信息: " getsebool sestatus echo "" echo "----->>>---->>> 建议禁用Transparent Huge Pages (THP): " cat /sys/kernel/mm/transparent_hugepage/enabled cat /sys/kernel/mm/transparent_hugepage/defrag cat /sys/kernel/mm/redhat_transparent_hugepage/enabled cat /sys/kernel/mm/redhat_transparent_hugepage/defrag echo "" echo "----->>>---->>> 硬盘SMART信息(需要root): " smartctl --scan|awk -F "#" '{print $1}' | while read i; do echo -e "\n\nDEVICE $i"; smartctl -a $i; done echo "" echo "----->>>---->>> /var/log/boot.log " cat /var/log/boot.log echo "" echo "----->>>---->>> /var/log/cron(需要root) " cat /var/log/cron echo "" echo "----->>>---->>> /var/log/dmesg " cat /var/log/dmesg echo "" echo "----->>>---->>> /var/log/messages(需要root) " tail -n 500 /var/log/messages echo "" echo "----->>>---->>> /var/log/secure(需要root) " cat /var/log/secure echo "" echo "----->>>---->>> /var/log/wtmp " who -a /var/log/wtmp echo -e "\n" echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|" echo "| 数据库信息 |" echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|" echo "" echo "----->>>---->>> 数据库版本: " psql --pset=pager=off -q -c 'select version()' echo "----->>>---->>> 用户已安装的插件版本: " for db in `psql --pset=pager=off -t -A -q -c 'select datname from pg_database where datname not in ($$template0$$, $$template1$$)'` do psql -d $db --pset=pager=off -q -c 'select current_database(),* from pg_extension' done echo "----->>>---->>> 用户使用了多少种数据类型: " for db in `psql --pset=pager=off -t -A -q -c 'select datname from pg_database where datname not in ($$template0$$, $$template1$$)'` do psql -d $db --pset=pager=off -q -c 'select current_database(),b.typname,count(*) from pg_attribute a,pg_type b where a.atttypid=b.oid and a.attrelid in (select oid from pg_class where relnamespace not in (select oid from pg_namespace where nspname ~ $$^pg_$$ or nspname=$$information_schema$$)) group by 1,2 order by 3 desc' done echo "----->>>---->>> 用户创建了多少对象: " for db in `psql --pset=pager=off -t -A -q -c 'select datname from pg_database where datname not in ($$template0$$, $$template1$$)'` do psql -d $db --pset=pager=off -q -c 'select current_database(),rolname,nspname,relkind,count(*) from pg_class a,pg_authid b,pg_namespace c where a.relnamespace=c.oid and a.relowner=b.oid and nspname !~ $$^pg_$$ and nspname<>$$information_schema$$ group by 1,2,3,4 order by 5 desc' done echo "----->>>---->>> 用户对象占用空间的柱状图: " for db in `psql --pset=pager=off -t -A -q -c 'select datname from pg_database where datname not in ($$template0$$, $$template1$$)'` do psql -d $db --pset=pager=off -q -c 'select current_database(),buk this_buk_no,cnt rels_in_this_buk,pg_size_pretty(min) buk_min,pg_size_pretty(max) buk_max from( select row_number() over (partition by buk order by tsize),tsize,buk,min(tsize) over (partition by buk),max(tsize) over (partition by buk),count(*) over (partition by buk) cnt from ( select pg_relation_size(a.oid) tsize, width_bucket(pg_relation_size(a.oid),tmin-1,tmax+1,10) buk from (select min(pg_relation_size(a.oid)) tmin,max(pg_relation_size(a.oid)) tmax from pg_class a,pg_namespace c where a.relnamespace=c.oid and nspname !~ $$^pg_$$ and nspname<>$$information_schema$$) t, pg_class a,pg_namespace c where a.relnamespace=c.oid and nspname !~ $$^pg_$$ and nspname<>$$information_schema$$ ) t)t where row_number=1;' done echo "----->>>---->>> 当前用户的操作系统定时任务: " echo "I am `whoami`" crontab -l echo "建议: " echo " 仔细检查定时任务的必要性, 以及定时任务的成功与否的评判标准, 以及监控措施. " echo " 请以启动数据库的OS用户执行本脚本. " echo -e "\n" common() { # 进入pg_log工作目录 cd $PGDATA eval cd $pg_log_dir echo "----->>>---->>> 获取pg_hba.conf md5值: " md5sum $PGDATA/pg_hba.conf echo "建议: " echo " 主备md5值一致(判断主备配置文件是否内容一致的一种手段, 或者使用diff)." echo -e "\n" echo "----->>>---->>> 获取pg_hba.conf配置: " grep '^\ *[a-z]' $PGDATA/pg_hba.conf echo "建议: " echo " 主备配置尽量保持一致, 注意trust和password认证方法的危害(password方法 验证时网络传输密码明文, 建议改为md5), 建议除了unix socket可以使用trust以外, 其他都使用md5或者LDAP认证方法." echo " 建议先设置白名单(超级用户允许的来源IP, 可以访问的数据库), 再设置黑名单(不允许超级用户登陆, reject), 再设置白名单(普通应用), 参考pg_hba.conf中的描述. " echo -e "\n" echo "----->>>---->>> 获取postgresql.conf md5值: " md5sum $PGDATA/postgresql.conf echo "建议: " echo " 主备md5值一致(判断主备配置文件是否内容一致的一种手段, 或者使用diff)." echo -e "\n" echo "----->>>---->>> 获取postgresql.conf配置: " grep '^\ *[a-z]' $PGDATA/postgresql.conf|awk -F "#" '{print $1}' echo "建议: " echo " 主备配置尽量保持一致, 配置合理的参数值." echo -e " 建议修改的参数列表如下 ( 假设操作系统内存为128GB, 数据库独占操作系统, 数据库版本9.4.x, 其他版本可能略有不同, 未来再更新进来 ) : echo "" listen_addresses = '0.0.0.0' # 监听所有IPV4地址 port = 1921 # 监听非默认端口 max_connections = 4000 # 最大允许连接数 superuser_reserved_connections = 20 # 为超级用户保留的连接 unix_socket_directories = '.' # unix socket文件目录最好放在$PGDATA中, 确保安全 unix_socket_permissions = 0700 # 确保权限安全 tcp_keepalives_idle = 30 # 间歇性发送TCP心跳包, 防止连接被网络设备中断. tcp_keepalives_interval = 10 tcp_keepalives_count = 10 shared_buffers = 16GB # 数据库自己管理的共享内存大小, 如果用大页, 建议设置为: 内存 - 100*work_mem - autovacuum_max_workers*(autovacuum_work_mem or autovacuum_work_mem) - max_connections*1MB huge_pages = try # 尽量使用大页, 需要操作系统支持, 配置vm.nr_hugepages*2MB大于shared_buffers. maintenance_work_mem = 512MB # 可以加速创建索引, 回收垃圾(假设没有设置autovacuum_work_mem) autovacuum_work_mem = 512MB # 可以加速回收垃圾 shared_preload_libraries = 'auth_delay,passwordcheck,pg_stat_statements,auto_explain' # 建议防止暴力破解, 密码复杂度检测, 开启pg_stat_statements, 开启auto_explain, 参考 http://blog.163.com/digoal@126/blog/static/16387704020149852941586 bgwriter_delay = 10ms # bgwriter process间隔多久调用write接口(注意不是fsync)将shared buffer中的dirty page写到文件系统. bgwriter_lru_maxpages = 1000 # 一个周期最多写多少脏页 max_worker_processes = 20 # 如果要使用worker process, 最多可以允许fork 多少个worker进程. wal_level = logical # 如果将来打算使用logical复制, 最后先配置好, 不需要停机再改. synchronous_commit = off # 如果磁盘的IOPS能力一般, 建议使用异步提交来提高性能, 但是数据库crash或操作系统crash时, 最多可能丢失2*wal_writer_delay时间段产生的事务日志(在wal buffer中). wal_sync_method = open_datasync # 使用pg_test_fsync测试wal所在磁盘的fsync接口, 使用性能好的. wal_buffers = 16MB wal_writer_delay = 10ms checkpoint_segments = 1024 # 等于shared_buffers除以单个wal segment的大小. checkpoint_timeout = 50min checkpoint_completion_target = 0.8 archive_mode = on # 最好先开启, 否则需要重启数据库来修改 archive_command = '/bin/date' # 最好先开启, 否则需要重启数据库来修改, 将来修改为正确的命令例如, test ! -f /home/postgres/archivedir/pg_root/%f && cp %p /home/postgres/archivedir/pg_root/%f max_wal_senders = 32 # 最多允许多少个wal sender进程. wal_keep_segments = 2048 # 在pg_xlog目录中保留的WAL文件数, 根据流复制业务的延迟情况和pg_xlog目录大小来预估. max_replication_slots = 32 # 最多允许多少个复制插槽 hot_standby = on max_standby_archive_delay = 300s # 如果备库要被用于只读, 有大的查询的情况下, 如果遇到conflicts, 可以考虑调整这个值来避免conflict造成cancel query. max_standby_streaming_delay = 300s # 如果备库要被用于只读, 有大的查询的情况下, 如果遇到conflicts, 可以考虑调整这个值来避免conflict造成cancel query. wal_receiver_status_interval = 1s hot_standby_feedback = off # 建议关闭, 如果备库出现long query,可能导致主库频繁的autovacuum(比如出现无法回收被需要的垃圾时) vacuum_defer_cleanup_age = 0 # 建议设置为0,避免主库出现频繁的autovacuum无用功,也许新版本会改进。 random_page_cost = 1.3 # 根据IO能力调整(企业级SSD为例 1.3是个经验值) effective_cache_size = 100GB # 调整为与内存一样大, 或者略小(减去shared_buffer). 用来评估OS PAGE CACHE可以用到的内存大小. log_destination = 'csvlog' logging_collector = on log_truncate_on_rotation = on log_rotation_size = 10MB log_min_duration_statement = 1s log_checkpoints = on log_connections = on log_disconnections = on log_error_verbosity = verbose # 在日志中输出代码位置 log_lock_waits = on log_statement = 'ddl' autovacuum = on log_autovacuum_min_duration = 0 autovacuum_max_workers = 10 # 根据实际频繁变更或删除记录的对象数决定 autovacuum_naptime = 30s # 快速唤醒, 防止膨胀 autovacuum_vacuum_scale_factor = 0.1 # 当垃圾超过比例时, 启动垃圾回收工作进程 autovacuum_analyze_scale_factor = 0.2 autovacuum_freeze_max_age = 1600000000 autovacuum_multixact_freeze_max_age = 1600000000 vacuum_freeze_table_age = 1500000000 vacuum_multixact_freeze_table_age = 1500000000 auth_delay.milliseconds = 5000 # 认证失败, 延迟多少毫秒反馈 auto_explain.log_min_duration = 5000 # 记录超过多少毫秒的SQL当时的执行计划 auto_explain.log_analyze = true auto_explain.log_verbose = true auto_explain.log_buffers = true auto_explain.log_nested_statements = true pg_stat_statements.track_utility=off 建议的操作系统配置(根据实际情况修改) : vi /etc/sysctl.conf # add by digoal.zhou fs.aio-max-nr = 1048576 fs.file-max = 76724600 kernel.core_pattern= /data01/corefiles/core_%e_%u_%t_%s.%p # /data01/corefiles事先建好,权限777 kernel.sem = 4096 2147483647 2147483646 512000 # 信号量, ipcs -l 或 -u 查看,每16个进程一组,每组信号量需要17个信号量。 kernel.shmall = 107374182 # 所有共享内存段相加大小限制(建议内存的80%) kernel.shmmax = 274877906944 # 最大单个共享内存段大小(建议为内存一半), >9.2的版本已大幅降低共享内存的使用 kernel.shmmni = 819200 # 一共能生成多少共享内存段,每个PG数据库集群至少2个共享内存段 net.core.netdev_max_backlog = 10000 net.core.rmem_default = 262144 # The default setting of the socket receive buffer in bytes. net.core.rmem_max = 4194304 # The maximum receive socket buffer size in bytes net.core.wmem_default = 262144 # The default setting (in bytes) of the socket send buffer. net.core.wmem_max = 4194304 # The maximum send socket buffer size in bytes. net.core.somaxconn = 4096 net.ipv4.tcp_max_syn_backlog = 4096 net.ipv4.tcp_keepalive_intvl = 20 net.ipv4.tcp_keepalive_probes = 3 net.ipv4.tcp_keepalive_time = 60 net.ipv4.tcp_mem = 8388608 12582912 16777216 net.ipv4.tcp_fin_timeout = 5 net.ipv4.tcp_synack_retries = 2 net.ipv4.tcp_syncookies = 1 # 开启SYN Cookies。当出现SYN等待队列溢出时,启用cookie来处理,可防范少量的SYN攻击 net.ipv4.tcp_timestamps = 1 # 减少time_wait net.ipv4.tcp_tw_recycle = 0 # 如果=1则开启TCP连接中TIME-WAIT套接字的快速回收,但是NAT环境可能导致连接失败,建议服务端关闭它 net.ipv4.tcp_tw_reuse = 1 # 开启重用。允许将TIME-WAIT套接字重新用于新的TCP连接 net.ipv4.tcp_max_tw_buckets = 262144 net.ipv4.tcp_rmem = 8192 87380 16777216 net.ipv4.tcp_wmem = 8192 65536 16777216 net.nf_conntrack_max = 1200000 net.netfilter.nf_conntrack_max = 1200000 vm.dirty_background_bytes = 409600000 # 系统脏页到达这个值,系统后台刷脏页调度进程 pdflush(或其他) 自动将(dirty_expire_centisecs/100)秒前的脏页刷到磁盘 vm.dirty_expire_centisecs = 3000 # 比这个值老的脏页,将被刷到磁盘。3000表示30秒。 vm.dirty_ratio = 95 # 如果系统进程刷脏页太慢,使得系统脏页超过内存 95 % 时,则用户进程如果有写磁盘的操作(如fsync, fdatasync等调用),则需要主动把系统脏页刷出。 # 有效防止用户进程刷脏页,在单机多实例,并且使用CGROUP限制单实例IOPS的情况下非常有效。 vm.dirty_writeback_centisecs = 100 # pdflush(或其他)后台刷脏页进程的唤醒间隔, 100表示1秒。 vm.extra_free_kbytes = 4096000 vm.min_free_kbytes = 2097152 vm.mmap_min_addr = 65536 vm.overcommit_memory = 0 # 在分配内存时,允许少量over malloc, 如果设置为 1, 则认为总是有足够的内存,内存较少的测试环境可以使用 1 . vm.overcommit_ratio = 90 # 当overcommit_memory = 2 时,用于参与计算允许指派的内存大小。 vm.swappiness = 0 # 关闭交换分区 vm.zone_reclaim_mode = 0 # 禁用 numa, 或者在vmlinux中禁止. net.ipv4.ip_local_port_range = 40000 65535 # 本地自动分配的TCP, UDP端口号范围 # vm.nr_hugepages = 102352 # 建议shared buffer设置超过64GB时 使用大页,页大小 /proc/meminfo Hugepagesize vi /etc/security/limits.conf * soft nofile 1024000 * hard nofile 1024000 * soft nproc unlimited * hard nproc unlimited * soft core unlimited * hard core unlimited * soft memlock unlimited * hard memlock unlimited rm -f /etc/security/limits.d/90-nproc.conf \n " echo "----->>>---->>> 用户或数据库级别定制参数: " psql --pset=pager=off -q -c 'select * from pg_db_role_setting' echo "建议: " echo " 定制参数需要关注, 优先级高于数据库的启动参数和配置文件中的参数, 特别是排错时需要关注. " echo -e "\n" echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|" echo "| 数据库错误日志分析 |" echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|" echo "" echo "----->>>---->>> 获取错误日志信息: " cat *.csv | grep -E "^[0-9]" | grep -E "WARNING|ERROR|FATAL|PANIC" | awk -F "," '{print $12" , "$13" , "$14}'|sort|uniq -c|sort -rn echo "建议: " echo " 参考 http://www.postgresql.org/docs/current/static/errcodes-appendix.html ." echo -e "\n" echo "----->>>---->>> 获取连接请求情况: " find . -name "*.csv" -type f -mtime -28 -exec grep "connection authorized" {} +|awk -F "," '{print $2,$3,$5}'|sed 's/\:[0-9]*//g'|sort|uniq -c|sort -n -r echo "建议: " echo " 连接请求非常多时, 请考虑应用层使用连接池, 或者使用pgbouncer连接池. " echo -e "\n" echo "----->>>---->>> 获取认证失败情况: " find . -name "*.csv" -type f -mtime -28 -exec grep "password authentication failed" {} +|awk -F "," '{print $2,$3,$5}'|sed 's/\:[0-9]*//g'|sort|uniq -c|sort -n -r echo "建议: " echo " 认证失败次数很多时, 可能是有用户在暴力破解, 建议使用auth_delay插件防止暴力破解. " echo -e "\n" echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|" echo "| 数据库慢SQL日志分析 |" echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|" echo "" echo "----->>>---->>> 慢查询统计: " cat *.csv|awk -F "," '{print $1" "$2" "$3" "$8" "$14}' |grep "duration:"|grep -v "plan:"|awk '{print $1" "$4" "$5" "$6}'|sort|uniq -c|sort -rn echo "建议: " echo " 输出格式(条数,日期,用户,数据库,QUERY,耗时ms). " echo " 慢查询反映执行时间超过log_min_duration_statement的SQL, 可以根据实际情况分析数据库或SQL语句是否有优化空间. " echo "" echo "----->>>---->>> 慢查询分布头10条的执行时间, ms: " cat *.csv|awk -F "," '{print $1" "$2" "$3" "$8" "$14}' |grep "duration:"|grep -v "plan:"|awk '{print $1" "$4" "$5" "$6" "$7" "$8}'|sort -k 6 -n|head -n 10 echo "" echo "----->>>---->>> 慢查询分布尾10条的执行时间, ms: " cat *.csv|awk -F "," '{print $1" "$2" "$3" "$8" "$14}' |grep "duration:"|grep -v "plan:"|awk '{print $1" "$4" "$5" "$6" "$7" "$8}'|sort -k 6 -n|tail -n 10 echo -e "\n" echo "----->>>---->>> auto_explain 分析统计: " cat *.csv|awk -F "," '{print $1" "$2" "$3" "$8" "$14}' |grep "plan:"|grep "duration:"|awk '{print $1" "$4" "$5" "$6}'|sort|uniq -c|sort -rn echo "建议: " echo " 输出格式(条数,日期,用户,数据库,QUERY). " echo " 慢查询反映执行时间超过auto_explain.log_min_duration的SQL, 可以根据实际情况分析数据库或SQL语句是否有优化空间, 分析csvlog中auto_explain的输出可以了解语句超时时的执行计划详情. " echo -e "\n" echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|" echo "| 数据库空间使用分析 |" echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|" echo "" echo "----->>>---->>> 输出文件系统剩余空间: " df -m echo "建议: " echo " 注意预留足够的空间给数据库. " echo -e "\n" echo "----->>>---->>> 输出表空间对应目录: " echo $PGDATA ls -la $PGDATA/pg_tblspc/ echo "建议: " echo " 注意表空间如果不是软链接, 注意是否刻意所为, 正常情况下应该是软链接. " echo -e "\n" echo "----->>>---->>> 输出表空间使用情况: " psql --pset=pager=off -q -c 'select spcname,pg_tablespace_location(oid),pg_size_pretty(pg_tablespace_size(oid)) from pg_tablespace order by pg_tablespace_size(oid) desc' echo "建议: " echo " 注意检查表空间所在文件系统的剩余空间, (默认表空间在$PGDATA/base目录下), IOPS分配是否均匀, OS的sysstat包可以观察IO使用率. " echo -e "\n" echo "----->>>---->>> 输出数据库使用情况: " psql --pset=pager=off -q -c 'select datname,pg_size_pretty(pg_database_size(oid)) from pg_database order by pg_database_size(oid) desc' echo "建议: " echo " 注意检查数据库的大小, 是否需要清理历史数据. " echo -e "\n" echo "----->>>---->>> TOP 10 size对象: " for db in `psql --pset=pager=off -t -A -q -c 'select datname from pg_database where datname not in ($$template0$$, $$template1$$)'` do psql -d $db --pset=pager=off -q -c 'select current_database(),b.nspname,c.relname,c.relkind,pg_size_pretty(pg_relation_size(c.oid)),a.seq_scan,a.seq_tup_read,a.idx_scan,a.idx_tup_fetch,a.n_tup_ins,a.n_tup_upd,a.n_tup_del,a.n_tup_hot_upd,a.n_live_tup,a.n_dead_tup from pg_stat_all_tables a, pg_class c,pg_namespace b where c.relnamespace=b.oid and c.relkind=$$r$$ and a.relid=c.oid order by pg_relation_size(c.oid) desc limit 10' done echo "建议: " echo " 经验值: 单表超过8GB, 并且这个表需要频繁更新 或 删除+插入的话, 建议对表根据业务逻辑进行合理拆分后获得更好的性能, 以及便于对膨胀索引进行维护; 如果是只读的表, 建议适当结合SQL语句进行优化. " echo -e "\n" echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|" echo "| 数据库连接分析 |" echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|" echo "" echo "----->>>---->>> 当前活跃度: " psql --pset=pager=off -q -c 'select now(),state,count(*) from pg_stat_activity group by 1,2' echo "建议: " echo " 如果active状态很多, 说明数据库比较繁忙. 如果idle in transaction很多, 说明业务逻辑设计可能有问题. 如果idle很多, 可能使用了连接池, 并且可能没有自动回收连接到连接池的最小连接数. " echo -e "\n" echo "----->>>---->>> 总剩余连接数: " psql --pset=pager=off -q -c 'select max_conn,used,res_for_super,max_conn-used-res_for_super res_for_normal from (select count(*) used from pg_stat_activity) t1,(select setting::int res_for_super from pg_settings where name=$$superuser_reserved_connections$$) t2,(select setting::int max_conn from pg_settings where name=$$max_connections$$) t3' echo "建议: " echo " 给超级用户和普通用户设置足够的连接, 以免不能登录数据库. " echo -e "\n" echo "----->>>---->>> 用户连接数限制: " psql --pset=pager=off -q -c 'select a.rolname,a.rolconnlimit,b.connects from pg_authid a,(select usename,count(*) connects from pg_stat_activity group by usename) b where a.rolname=b.usename order by b.connects desc' echo "建议: " echo " 给用户设置足够的连接数, alter role ... CONNECTION LIMIT . " echo -e "\n" echo "----->>>---->>> 数据库连接限制: " psql --pset=pager=off -q -c 'select a.datname, a.datconnlimit, b.connects from pg_database a,(select datname,count(*) connects from pg_stat_activity group by datname) b where a.datname=b.datname order by b.connects desc' echo "建议: " echo " 给数据库设置足够的连接数, alter database ... CONNECTION LIMIT . " echo -e "\n" echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|" echo "| 数据库性能分析 |" echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|" echo "" echo "----->>>---->>> TOP 5 SQL : total_cpu_time " psql --pset=pager=off -q -x -c 'select c.rolname,b.datname,a.total_time/a.calls per_call_time,a.* from pg_stat_statements a,pg_database b,pg_authid c where a.userid=c.oid and a.dbid=b.oid order by a.total_time desc limit 5' echo "建议: " echo " 检查SQL是否有优化空间, 配合auto_explain插件在csvlog中观察LONG SQL的执行计划是否正确. " echo -e "\n" echo "----->>>---->>> 索引数超过4并且SIZE大于10MB的表: " for db in `psql --pset=pager=off -t -A -q -c 'select datname from pg_database where datname not in ($$template0$$, $$template1$$)'` do psql -d $db --pset=pager=off -q -c 'select current_database(), t2.nspname, t1.relname, pg_size_pretty(pg_relation_size(t1.oid)), t3.idx_cnt from pg_class t1, pg_namespace t2, (select indrelid,count(*) idx_cnt from pg_index group by 1 having count(*)>4) t3 where t1.oid=t3.indrelid and t1.relnamespace=t2.oid and pg_relation_size(t1.oid)/1024/1024.0>10 order by t3.idx_cnt desc' done echo "建议: " echo " 索引数量太多, 影响表的增删改性能, 建议检查是否有不需要的索引. " echo -e "\n" echo "----->>>---->>> 上次巡检以来未使用或使用较少的索引: " for db in `psql --pset=pager=off -t -A -q -c 'select datname from pg_database where datname not in ($$template0$$, $$template1$$)'` do psql -d $db --pset=pager=off -q -c 'select current_database(),t2.schemaname,t2.relname,t2.indexrelname,t2.idx_scan,t2.idx_tup_read,t2.idx_tup_fetch,pg_size_pretty(pg_relation_size(indexrelid)) from pg_stat_all_tables t1,pg_stat_all_indexes t2 where t1.relid=t2.relid and t2.idx_scan<10 and t2.schemaname not in ($$pg_toast$$,$$pg_catalog$$) and indexrelid not in (select conindid from pg_constraint where contype in ($$p$$,$$u$$,$$f$$)) and pg_relation_size(indexrelid)>65536 order by pg_relation_size(indexrelid) desc' done echo "建议: " echo " 建议和应用开发人员确认后, 删除不需要的索引. " echo -e "\n" echo "----->>>---->>> 数据库统计信息, 回滚比例, 命中比例, 数据块读写时间, 死锁, 复制冲突: " psql --pset=pager=off -q -c 'select datname,round(100*(xact_rollback::numeric/(case when xact_commit > 0 then xact_commit else 1 end + xact_rollback)),2)||$$ %$$ rollback_ratio, round(100*(blks_hit::numeric/(case when blks_read>0 then blks_read else 1 end + blks_hit)),2)||$$ %$$ hit_ratio, blk_read_time, blk_write_time, conflicts, deadlocks from pg_stat_database' echo "建议: " echo " 回滚比例大说明业务逻辑可能有问题, 命中率小说明shared_buffer要加大, 数据块读写时间长说明块设备的IO性能要提升, 死锁次数多说明业务逻辑有问题, 复制冲突次数多说明备库可能在跑LONG SQL. " echo -e "\n" echo "----->>>---->>> 检查点, bgwriter 统计信息: " psql --pset=pager=off -q -x -c 'select * from pg_stat_bgwriter' echo "建议: " echo " checkpoint_write_time多说明检查点持续时间长, 检查点过程中产生了较多的脏页. " echo " checkpoint_sync_time代表检查点开始时的shared buffer中的脏页被同步到磁盘的时间, 如果时间过长, 并且数据库在检查点时性能较差, 考虑一下提升块设备的IOPS能力. " echo " buffers_backend_fsync太多说明需要加大shared buffer 或者 减小bgwriter_delay参数. " echo -e "\n" echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|" echo "| 数据库垃圾分析 |" echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|" echo "" echo "----->>>---->>> 表引膨胀检查: " for db in `psql --pset=pager=off -t -A -q -c 'select datname from pg_database where datname not in ($$template0$$, $$template1$$)'` do psql -d $db --pset=pager=off -q -x -c 'SELECT current_database() AS db, schemaname, tablename, reltuples::bigint AS tups, relpages::bigint AS pages, otta, ROUND(CASE WHEN otta=0 OR sml.relpages=0 OR sml.relpages=otta THEN 0.0 ELSE sml.relpages/otta::numeric END,1) AS tbloat, CASE WHEN relpages < otta THEN 0 ELSE relpages::bigint - otta END AS wastedpages, CASE WHEN relpages < otta THEN 0 ELSE bs*(sml.relpages-otta)::bigint END AS wastedbytes, CASE WHEN relpages < otta THEN $$0 bytes$$::text ELSE (bs*(relpages-otta))::bigint || $$ bytes$$ END AS wastedsize, iname, ituples::bigint AS itups, ipages::bigint AS ipages, iotta, ROUND(CASE WHEN iotta=0 OR ipages=0 OR ipages=iotta THEN 0.0 ELSE ipages/iotta::numeric END,1) AS ibloat, CASE WHEN ipages < iotta THEN 0 ELSE ipages::bigint - iotta END AS wastedipages, CASE WHEN ipages < iotta THEN 0 ELSE bs*(ipages-iotta) END AS wastedibytes, CASE WHEN ipages < iotta THEN $$0 bytes$$ ELSE (bs*(ipages-iotta))::bigint || $$ bytes$$ END AS wastedisize, CASE WHEN relpages < otta THEN CASE WHEN ipages < iotta THEN 0 ELSE bs*(ipages-iotta::bigint) END ELSE CASE WHEN ipages < iotta THEN bs*(relpages-otta::bigint) ELSE bs*(relpages-otta::bigint + ipages-iotta::bigint) END END AS totalwastedbytes FROM ( SELECT nn.nspname AS schemaname, cc.relname AS tablename, COALESCE(cc.reltuples,0) AS reltuples, COALESCE(cc.relpages,0) AS relpages, COALESCE(bs,0) AS bs, COALESCE(CEIL((cc.reltuples*((datahdr+ma- (CASE WHEN datahdr%ma=0 THEN ma ELSE datahdr%ma END))+nullhdr2+4))/(bs-20::float)),0) AS otta, COALESCE(c2.relname,$$?$$) AS iname, COALESCE(c2.reltuples,0) AS ituples, COALESCE(c2.relpages,0) AS ipages, COALESCE(CEIL((c2.reltuples*(datahdr-12))/(bs-20::float)),0) AS iotta -- very rough approximation, assumes all cols FROM pg_class cc JOIN pg_namespace nn ON cc.relnamespace = nn.oid AND nn.nspname <> $$information_schema$$ LEFT JOIN ( SELECT ma,bs,foo.nspname,foo.relname, (datawidth+(hdr+ma-(case when hdr%ma=0 THEN ma ELSE hdr%ma END)))::numeric AS datahdr, (maxfracsum*(nullhdr+ma-(case when nullhdr%ma=0 THEN ma ELSE nullhdr%ma END))) AS nullhdr2 FROM ( SELECT ns.nspname, tbl.relname, hdr, ma, bs, SUM((1-coalesce(null_frac,0))*coalesce(avg_width, 2048)) AS datawidth, MAX(coalesce(null_frac,0)) AS maxfracsum, hdr+( SELECT 1+count(*)/8 FROM pg_stats s2 WHERE null_frac<>0 AND s2.schemaname = ns.nspname AND s2.tablename = tbl.relname ) AS nullhdr FROM pg_attribute att JOIN pg_class tbl ON att.attrelid = tbl.oid JOIN pg_namespace ns ON ns.oid = tbl.relnamespace LEFT JOIN pg_stats s ON s.schemaname=ns.nspname AND s.tablename = tbl.relname AND s.inherited=false AND s.attname=att.attname, ( SELECT (SELECT current_setting($$block_size$$)::numeric) AS bs, CASE WHEN SUBSTRING(SPLIT_PART(v, $$ $$, 2) FROM $$#"[0-9]+.[0-9]+#"%$$ for $$#$$) IN ($$8.0$$,$$8.1$$,$$8.2$$) THEN 27 ELSE 23 END AS hdr, CASE WHEN v ~ $$mingw32$$ OR v ~ $$64-bit$$ THEN 8 ELSE 4 END AS ma FROM (SELECT version() AS v) AS foo ) AS constants WHERE att.attnum > 0 AND tbl.relkind=$$r$$ GROUP BY 1,2,3,4,5 ) AS foo ) AS rs ON cc.relname = rs.relname AND nn.nspname = rs.nspname LEFT JOIN pg_index i ON indrelid = cc.oid LEFT JOIN pg_class c2 ON c2.oid = i.indexrelid ) AS sml order by wastedbytes desc limit 5' done echo "建议: " echo " 根据浪费的字节数, 设置合适的autovacuum_vacuum_scale_factor, 大表如果频繁的有更新或删除和插入操作, 建议设置较小的autovacuum_vacuum_scale_factor来降低浪费空间. " echo " 同时还需要打开autovacuum, 根据服务器的内存大小, CPU核数, 设置足够大的autovacuum_work_mem 或 autovacuum_max_workers 或 maintenance_work_mem, 以及足够小的 autovacuum_naptime . " echo " 同时还需要分析是否对大数据库使用了逻辑备份pg_dump, 系统中是否经常有长SQL, 长事务. 这些都有可能导致膨胀. " echo " 使用pg_reorg或者vacuum full可以回收膨胀的空间. " echo " 参考: http://blog.163.com/digoal@126/blog/static/1638770402015329115636287/ " echo " otta评估出的表实际需要页数, iotta评估出的索引实际需要页数; " echo " bs数据库的块大小; " echo " tbloat表膨胀倍数, ibloat索引膨胀倍数, wastedpages表浪费了多少个数据块, wastedipages索引浪费了多少个数据块; " echo " wastedbytes表浪费了多少字节, wastedibytes索引浪费了多少字节; " echo -e "\n" echo "----->>>---->>> 索引膨胀检查: " for db in `psql --pset=pager=off -t -A -q -c 'select datname from pg_database where datname not in ($$template0$$, $$template1$$)'` do psql -d $db --pset=pager=off -q -x -c 'SELECT current_database() AS db, schemaname, tablename, reltuples::bigint AS tups, relpages::bigint AS pages, otta, ROUND(CASE WHEN otta=0 OR sml.relpages=0 OR sml.relpages=otta THEN 0.0 ELSE sml.relpages/otta::numeric END,1) AS tbloat, CASE WHEN relpages < otta THEN 0 ELSE relpages::bigint - otta END AS wastedpages, CASE WHEN relpages < otta THEN 0 ELSE bs*(sml.relpages-otta)::bigint END AS wastedbytes, CASE WHEN relpages < otta THEN $$0 bytes$$::text ELSE (bs*(relpages-otta))::bigint || $$ bytes$$ END AS wastedsize, iname, ituples::bigint AS itups, ipages::bigint AS ipages, iotta, ROUND(CASE WHEN iotta=0 OR ipages=0 OR ipages=iotta THEN 0.0 ELSE ipages/iotta::numeric END,1) AS ibloat, CASE WHEN ipages < iotta THEN 0 ELSE ipages::bigint - iotta END AS wastedipages, CASE WHEN ipages < iotta THEN 0 ELSE bs*(ipages-iotta) END AS wastedibytes, CASE WHEN ipages < iotta THEN $$0 bytes$$ ELSE (bs*(ipages-iotta))::bigint || $$ bytes$$ END AS wastedisize, CASE WHEN relpages < otta THEN CASE WHEN ipages < iotta THEN 0 ELSE bs*(ipages-iotta::bigint) END ELSE CASE WHEN ipages < iotta THEN bs*(relpages-otta::bigint) ELSE bs*(relpages-otta::bigint + ipages-iotta::bigint) END END AS totalwastedbytes FROM ( SELECT nn.nspname AS schemaname, cc.relname AS tablename, COALESCE(cc.reltuples,0) AS reltuples, COALESCE(cc.relpages,0) AS relpages, COALESCE(bs,0) AS bs, COALESCE(CEIL((cc.reltuples*((datahdr+ma- (CASE WHEN datahdr%ma=0 THEN ma ELSE datahdr%ma END))+nullhdr2+4))/(bs-20::float)),0) AS otta, COALESCE(c2.relname,$$?$$) AS iname, COALESCE(c2.reltuples,0) AS ituples, COALESCE(c2.relpages,0) AS ipages, COALESCE(CEIL((c2.reltuples*(datahdr-12))/(bs-20::float)),0) AS iotta -- very rough approximation, assumes all cols FROM pg_class cc JOIN pg_namespace nn ON cc.relnamespace = nn.oid AND nn.nspname <> $$information_schema$$ LEFT JOIN ( SELECT ma,bs,foo.nspname,foo.relname, (datawidth+(hdr+ma-(case when hdr%ma=0 THEN ma ELSE hdr%ma END)))::numeric AS datahdr, (maxfracsum*(nullhdr+ma-(case when nullhdr%ma=0 THEN ma ELSE nullhdr%ma END))) AS nullhdr2 FROM ( SELECT ns.nspname, tbl.relname, hdr, ma, bs, SUM((1-coalesce(null_frac,0))*coalesce(avg_width, 2048)) AS datawidth, MAX(coalesce(null_frac,0)) AS maxfracsum, hdr+( SELECT 1+count(*)/8 FROM pg_stats s2 WHERE null_frac<>0 AND s2.schemaname = ns.nspname AND s2.tablename = tbl.relname ) AS nullhdr FROM pg_attribute att JOIN pg_class tbl ON att.attrelid = tbl.oid JOIN pg_namespace ns ON ns.oid = tbl.relnamespace LEFT JOIN pg_stats s ON s.schemaname=ns.nspname AND s.tablename = tbl.relname AND s.inherited=false AND s.attname=att.attname, ( SELECT (SELECT current_setting($$block_size$$)::numeric) AS bs, CASE WHEN SUBSTRING(SPLIT_PART(v, $$ $$, 2) FROM $$#"[0-9]+.[0-9]+#"%$$ for $$#$$) IN ($$8.0$$,$$8.1$$,$$8.2$$) THEN 27 ELSE 23 END AS hdr, CASE WHEN v ~ $$mingw32$$ OR v ~ $$64-bit$$ THEN 8 ELSE 4 END AS ma FROM (SELECT version() AS v) AS foo ) AS constants WHERE att.attnum > 0 AND tbl.relkind=$$r$$ GROUP BY 1,2,3,4,5 ) AS foo ) AS rs ON cc.relname = rs.relname AND nn.nspname = rs.nspname LEFT JOIN pg_index i ON indrelid = cc.oid LEFT JOIN pg_class c2 ON c2.oid = i.indexrelid ) AS sml order by wastedibytes desc limit 5' done echo "建议: " echo " 如果索引膨胀太大, 会影响性能, 建议重建索引, create index CONCURRENTLY ... . " echo -e "\n" echo "----->>>---->>> 垃圾数据: " for db in `psql --pset=pager=off -t -A -q -c 'select datname from pg_database where datname not in ($$template0$$, $$template1$$)'` do psql -d $db --pset=pager=off -q -c 'select current_database(),schemaname,relname,n_dead_tup from pg_stat_all_tables where n_live_tup>0 and n_dead_tup/n_live_tup>0.2 and schemaname not in ($$pg_toast$$,$$pg_catalog$$) order by n_dead_tup desc limit 5' done echo "建议: " echo " 通常垃圾过多, 可能是因为无法回收垃圾, 或者回收垃圾的进程繁忙或没有及时唤醒, 或者没有开启autovacuum, 或在短时间内产生了大量的垃圾 . " echo " 可以等待autovacuum进行处理, 或者手工执行vacuum table . " echo -e "\n" echo "----->>>---->>> 未引用的大对象: " for db in `psql --pset=pager=off -t -A -q -c 'select datname from pg_database where datname not in ($$template0$$, $$template1$$)'` do vacuumlo -n $db -w echo "" done echo "建议: " echo " 如果大对象没有被引用时, 建议删除, 否则就类似于内存泄露, 使用vacuumlo可以删除未被引用的大对象, 例如: vacuumlo -l 1000 $db -w . " echo " 应用开发时, 注意及时删除不需要使用的大对象, 使用lo_unlink 或 驱动对应的API . " echo " 参考 http://www.postgresql.org/docs/9.4/static/largeobjects.html " echo -e "\n" echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|" echo "| 数据库年龄分析 |" echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|" echo "" echo "----->>>---->>> 数据库年龄: " psql --pset=pager=off -q -c 'select datname,age(datfrozenxid),2^31-age(datfrozenxid) age_remain from pg_database order by age(datfrozenxid) desc' echo "建议: " echo " 数据库的年龄正常情况下应该小于vacuum_freeze_table_age, 如果剩余年龄小于5亿, 建议人为干预, 将LONG SQL或事务杀掉后, 执行vacuum freeze . " echo -e "\n" echo "----->>>---->>> 表年龄: " for db in `psql --pset=pager=off -t -A -q -c 'select datname from pg_database where datname not in ($$template0$$, $$template1$$)'` do psql -d $db --pset=pager=off -q -c 'select current_database(),rolname,nspname,relkind,relname,age(relfrozenxid),2^31-age(relfrozenxid) age_remain from pg_authid t1 join pg_class t2 on t1.oid=t2.relowner join pg_namespace t3 on t2.relnamespace=t3.oid where t2.relkind in ($$t$$,$$r$$) order by age(relfrozenxid) desc limit 5' done echo "建议: " echo " 表的年龄正常情况下应该小于vacuum_freeze_table_age, 如果剩余年龄小于5亿, 建议人为干预, 将LONG SQL或事务杀掉后, 执行vacuum freeze . " echo -e "\n" echo "----->>>---->>> 长事务, 2PC: " psql --pset=pager=off -q -x -c 'select datname,usename,query,xact_start,now()-xact_start xact_duration,query_start,now()-query_start query_duration,state from pg_stat_activity where state<>$$idle$$ and (backend_xid is not null or backend_xmin is not null) and now()-xact_start > interval $$30 min$$ order by xact_start' psql --pset=pager=off -q -x -c 'select name,statement,prepare_time,now()-prepare_time,parameter_types,from_sql from pg_prepared_statements where now()-prepare_time > interval $$30 min$$ order by prepare_time' echo "建议: " echo " 长事务过程中产生的垃圾, 无法回收, 建议不要在数据库中运行LONG SQL, 或者错开DML高峰时间去运行LONG SQL. 2PC事务一定要记得尽快结束掉, 否则可能会导致数据库膨胀. " echo " 参考: http://blog.163.com/digoal@126/blog/static/1638770402015329115636287/ " echo -e "\n" echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|" echo "| 数据库XLOG, 流复制状态分析 |" echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|" echo "" echo "----->>>---->>> 是否开启归档, 自动垃圾回收: " psql --pset=pager=off -q -c 'select name,setting from pg_settings where name in ($$archive_mode$$,$$autovacuum$$,$$archive_command$$)' echo "建议: " echo " 建议开启自动垃圾回收, 开启归档. " echo -e "\n" echo "----->>>---->>> 归档统计信息: " psql --pset=pager=off -q -c 'select pg_xlogfile_name(pg_current_xlog_location()) now_xlog, * from pg_stat_archiver' echo "建议: " echo " 如果当前的XLOG文件和最后一个归档失败的XLOG文件之间相差很多个文件, 建议尽快排查归档失败的原因, 以便修复, 否则pg_xlog目录可能会撑爆. " echo -e "\n" echo "----->>>---->>> 流复制统计信息: " psql --pset=pager=off -q -x -c 'select pg_xlog_location_diff(pg_current_xlog_location(),flush_location), * from pg_stat_replication' echo "建议: " echo " 关注流复制的延迟, 如果延迟非常大, 建议排查网络带宽, 以及本地读xlog的性能, 远程写xlog的性能. " echo -e "\n" echo "----->>>---->>> 流复制插槽: " psql --pset=pager=off -q -c 'select pg_xlog_location_diff(pg_current_xlog_location(),restart_lsn), * from pg_replication_slots' echo "建议: " echo " 如果restart_lsn和当前XLOG相差非常大的字节数, 需要排查slot的订阅者是否能正常接收XLOG, 或者订阅者是否正常. 长时间不将slot的数据取走, pg_xlog目录可能会撑爆. " echo -e "\n" echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|" echo "| 数据库安全或潜在风险分析 |" echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|" echo "" echo "----->>>---->>> 密码泄露检查: " echo " 检查 ~/.psql_history : " grep -i "password" ~/.psql_history|grep -i -E "role|group|user" echo "" echo " 检查 *.csv : " cat *.csv | grep -E "^[0-9]" | grep -i -r -E "role|group|user" |grep -i "password"|grep -i -E "create|alter" echo "" echo " 检查 $PGDATA/recovery.* : " grep -i "password" ../recovery.* echo "" echo " 检查 pg_stat_statements : " psql --pset=pager=off -c 'select query from pg_stat_statements where (query ~* $$group$$ or query ~* $$user$$ or query ~* $$role$$) and query ~* $$password$$' echo " 检查 pg_authid : " psql --pset=pager=off -q -c 'select * from pg_authid where rolpassword !~ $$^md5$$ or length(rolpassword)<>35' echo " 检查 pg_user_mappings, pg_views : " for db in `psql --pset=pager=off -t -A -q -c 'select datname from pg_database where datname not in ($$template0$$, $$template1$$)'` do psql -d $db --pset=pager=off -c 'select current_database(),* from pg_user_mappings where umoptions::text ~* $$password$$' psql -d $db --pset=pager=off -c 'select current_database(),* from pg_views where definition ~* $$password$$ and definition ~* $$dblink$$' done echo "建议: " echo " 如果以上输出显示密码已泄露, 尽快修改, 并通过参数避免密码又被记录到以上文件中(psql -n) (set log_statement='none'; set log_min_duration_statement=-1; set log_duration=off; set pg_stat_statements.track_utility=off;) . " echo " 明文密码不安全, 建议使用create|alter role ... encrypted password. " echo " 在fdw, dblink based view中不建议使用密码明文. " echo " 在recovery.*的配置中不要使用密码, 不安全, 可以使用.pgpass配置密码 . " echo -e "\n" echo "----->>>---->>> 简单密码检查: " echo " 1. 检查已有密码是否简单, 从crackdb库提取密码字典, 挨个检查 : " echo " 检查 md5('$pwd'||'$username')是否与pg_authid.rolpassword匹配 : " echo " 匹配则说明用户使用了简单密码 : " echo "" echo " 2. 事前检查参考 http://blog.163.com/digoal@126/blog/static/16387704020149852941586" echo -e "\n" echo "----->>>---->>> 用户密码到期时间: " psql --pset=pager=off -q -c 'select rolname,rolvaliduntil from pg_authid order by rolvaliduntil' echo "建议: " echo " 到期后, 用户将无法登陆, 记得修改密码, 同时将密码到期时间延长到某个时间或无限时间, alter role ... VALID UNTIL 'timestamp' . " echo -e "\n" echo "----->>>---->>> SQL注入风险分析: " cat *.csv | grep -E "^[0-9]" | grep exec_simple_query |awk -F "," '{print $2" "$3" "$5" "$NF}'|sed 's/\:[0-9]*//g'|sort|uniq -c|sort -n -r echo "建议: " echo " 调用exec_simple_query有风险, 允许多个SQL封装在一个接口中调用, 建议程序使用绑定变量规避SQL注入风险, 或者程序端使用SQL注入过滤插件. " echo -e "\n" echo "----->>>---->>> 普通用户对象上的规则安全检查: " for db in `psql --pset=pager=off -t -A -q -c 'select datname from pg_database where datname not in ($$template0$$, $$template1$$)'` do psql -d $db --pset=pager=off -c 'select current_database(),a.schemaname,a.tablename,a.rulename,a.definition from pg_rules a,pg_namespace b,pg_class c,pg_authid d where a.schemaname=b.nspname and a.tablename=c.relname and d.oid=c.relowner and not d.rolsuper union all select current_database(),a.schemaname,a.viewname,a.viewowner,a.definition from pg_views a,pg_namespace b,pg_class c,pg_authid d where a.schemaname=b.nspname and a.viewname=c.relname and d.oid=c.relowner and not d.rolsuper' done echo "建议: " echo " 防止普通用户在规则中设陷阱, 注意有危险的security invoker的函数调用, 超级用户可能因为规则触发后误调用这些危险函数(以invoker角色). " echo " 参考 http://blog.163.com/digoal@126/blog/static/16387704020155131217736/ " echo -e "\n" echo "----->>>---->>> 普通用户自定义函数安全检查: " for db in `psql --pset=pager=off -t -A -q -c 'select datname from pg_database where datname not in ($$template0$$, $$template1$$)'` do psql -d $db --pset=pager=off -c 'select current_database(),b.rolname,c.nspname,a.proname from pg_proc a,pg_authid b,pg_namespace c where a.proowner=b.oid and a.pronamespace=c.oid and not b.rolsuper and not a.prosecdef' done echo "建议: " echo " 防止普通用户在函数中设陷阱, 注意有危险的security invoker的函数调用, 超级用户可能因为触发器触发后误调用这些危险函数(以invoker角色). " echo " 参考 http://blog.163.com/digoal@126/blog/static/16387704020155131217736/ " echo -e "\n" echo "----->>>---->>> unlogged table 和 哈希索引: " for db in `psql --pset=pager=off -t -A -q -c 'select datname from pg_database where datname not in ($$template0$$, $$template1$$)'` do psql -d $db --pset=pager=off -q -c 'select current_database(),t3.rolname,t2.nspname,t1.relname from pg_class t1,pg_namespace t2,pg_authid t3 where t1.relnamespace=t2.oid and t1.relowner=t3.oid and t1.relpersistence=$$u$$' psql -d $db --pset=pager=off -q -c 'select current_database(),pg_get_indexdef(oid) from pg_class where relkind=$$i$$ and pg_get_indexdef(oid) ~ $$USING hash$$' done echo "建议: " echo " unlogged table和hash index不记录XLOG, 无法使用流复制或者log shipping的方式复制到standby节点, 如果在standby节点执行某些SQL, 可能导致报错或查不到数据. " echo " 在数据库CRASH后无法修复unlogged table和hash index, 不建议使用. " echo " PITR对unlogged table和hash index也不起作用. " echo -e "\n" echo "----->>>---->>> 剩余可使用次数不足1000万次的序列检查: " for db in `psql --pset=pager=off -t -A -q -c 'select datname from pg_database where datname not in ($$template0$$, $$template1$$)'` do psql -d $db --pset=pager=off <>>---->>> 触发器, 事件触发器: " for db in `psql --pset=pager=off -t -A -q -c 'select datname from pg_database where datname not in ($$template0$$, $$template1$$)'` do psql -d $db --pset=pager=off -q -c 'select current_database(),relname,tgname,proname,tgenabled from pg_trigger t1,pg_class t2,pg_proc t3 where t1.tgfoid=t3.oid and t1.tgrelid=t2.oid' psql -d $db --pset=pager=off -q -c 'select current_database(),rolname,proname,evtname,evtevent,evtenabled,evttags from pg_event_trigger t1,pg_proc t2,pg_authid t3 where t1.evtfoid=t2.oid and t1.evtowner=t3.oid' done echo "建议: " echo " 请管理员注意触发器和事件触发器的必要性. " echo -e "\n" echo "----->>>---->>> 检查是否使用了a-z 0-9 _ 以外的字母作为对象名: " psql --pset=pager=off -q -c 'select distinct datname from (select datname,regexp_split_to_table(datname,$$$$) word from pg_database) t where (not (ascii(word) >=97 and ascii(word) <=122)) and (not (ascii(word) >=48 and ascii(word) <=57)) and ascii(word)<>95' for db in `psql --pset=pager=off -t -A -q -c 'select datname from pg_database where datname not in ($$template0$$, $$template1$$)'` do psql -d $db --pset=pager=off -q -c 'select current_database(),relname,relkind from (select relname,relkind,regexp_split_to_table(relname,$$$$) word from pg_class) t where (not (ascii(word) >=97 and ascii(word) <=122)) and (not (ascii(word) >=48 and ascii(word) <=57)) and ascii(word)<>95 group by 1,2,3' psql -d $db --pset=pager=off -q -c 'select current_database(), typname from (select typname,regexp_split_to_table(typname,$$$$) word from pg_type) t where (not (ascii(word) >=97 and ascii(word) <=122)) and (not (ascii(word) >=48 and ascii(word) <=57)) and ascii(word)<>95 group by 1,2' psql -d $db --pset=pager=off -q -c 'select current_database(), proname from (select proname,regexp_split_to_table(proname,$$$$) word from pg_proc where proname !~ $$^RI_FKey_$$) t where (not (ascii(word) >=97 and ascii(word) <=122)) and (not (ascii(word) >=48 and ascii(word) <=57)) and ascii(word)<>95 group by 1,2' psql -d $db --pset=pager=off -q -c 'select current_database(),nspname,relname,attname from (select nspname,relname,attname,regexp_split_to_table(attname,$$$$) word from pg_class a,pg_attribute b,pg_namespace c where a.oid=b.attrelid and a.relnamespace=c.oid ) t where (not (ascii(word) >=97 and ascii(word) <=122)) and (not (ascii(word) >=48 and ascii(word) <=57)) and ascii(word)<>95 group by 1,2,3,4' done echo "建议: " echo " 建议任何identify都只使用 a-z, 0-9, _ (例如表名, 列名, 视图名, 函数名, 类型名, 数据库名, schema名, 物化视图名等等). " echo " identify 用法 https://yq.aliyun.com/articles/52883 . " echo " https://www.postgresql.org/docs/9.5/static/sql-keywords-appendix.html . " echo " https://www.postgresql.org/docs/9.5/static/sql-syntax-lexical.html#SQL-SYNTAX-IDENTIFIERS . " echo -e "\n" echo "----->>>---->>> 锁等待: " psql -x --pset=pager=off < w.pid ) ), t_unionall as ( select r.* from t_overlap r union all select w.* from t_wait w ) select locktype,datname,relation::regclass,page,tuple,virtualxid,transactionid::text,classid::regclass,objid,objsubid, string_agg( 'Pid: '||case when pid is null then 'NULL' else pid::text end||chr(10)|| 'Lock_Granted: '||case when granted is null then 'NULL' else granted::text end||' , Mode: '||case when mode is null then 'NULL' else mode::text end||' , FastPath: '||case when fastpath is null then 'NULL' else fastpath::text end||' , VirtualTransaction: '||case when virtualtransaction is null then 'NULL' else virtualtransaction::text end||' , Session_State: '||case when state is null then 'NULL' else state::text end||chr(10)|| 'Username: '||case when usename is null then 'NULL' else usename::text end||' , Database: '||case when datname is null then 'NULL' else datname::text end||' , Client_Addr: '||case when client_addr is null then 'NULL' else client_addr::text end||' , Client_Port: '||case when client_port is null then 'NULL' else client_port::text end||' , Application_Name: '||case when application_name is null then 'NULL' else application_name::text end||chr(10)|| 'Xact_Start: '||case when xact_start is null then 'NULL' else xact_start::text end||' , Query_Start: '||case when query_start is null then 'NULL' else query_start::text end||' , Xact_Elapse: '||case when (now()-xact_start) is null then 'NULL' else (now()-xact_start)::text end||' , Query_Elapse: '||case when (now()-query_start) is null then 'NULL' else (now()-query_start)::text end||chr(10)|| 'SQL (Current SQL in Transaction): '||chr(10)|| case when query is null then 'NULL' else query::text end, chr(10)||'--------'||chr(10) order by ( case mode when 'INVALID' then 0 when 'AccessShareLock' then 1 when 'RowShareLock' then 2 when 'RowExclusiveLock' then 3 when 'ShareUpdateExclusiveLock' then 4 when 'ShareLock' then 5 when 'ShareRowExclusiveLock' then 6 when 'ExclusiveLock' then 7 when 'AccessExclusiveLock' then 8 else 0 end ) desc, (case when granted then 0 else 1 end) ) as lock_conflict from t_unionall group by locktype,datname,relation,page,tuple,virtualxid,transactionid::text,classid,objid,objsubid ; EOF echo "建议: " echo " 锁等待状态, 反映业务逻辑的问题或者SQL性能有问题, 建议深入排查持锁的SQL. " echo -e "\n" echo "----->>>---->>> 继承关系检查: " for db in `psql --pset=pager=off -t -A -q -c 'select datname from pg_database where datname not in ($$template0$$, $$template1$$)'` do psql -d $db --pset=pager=off -q -c 'select inhrelid::regclass,inhparent::regclass,inhseqno from pg_inherits order by 2,3' done echo "建议: " echo " 如果使用继承来实现分区表, 注意分区表的触发器中逻辑是否正常, 对于时间模式的分区表是否需要及时加分区, 修改触发器函数 . " echo " 建议继承表的权限统一, 如果权限不一致, 可能导致某些用户查询时权限不足. " echo -e "\n" echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|" echo "| 重置统计信息 |" echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|" echo "" echo "----->>>---->>> 重置统计信息: " for db in `psql --pset=pager=off -t -A -q -c 'select datname from pg_database where datname not in ($$template0$$, $$template1$$)'` do psql -d $db --pset=pager=off -c 'select pg_stat_reset()' done psql --pset=pager=off -c 'select pg_stat_reset_shared($$bgwriter$$)' psql --pset=pager=off -c 'select pg_stat_reset_shared($$archiver$$)' echo "----->>>---->>> 重置pg_stat_statements统计信息: " psql --pset=pager=off -q -A -c 'select pg_stat_statements_reset()' } # common function end primary() { echo "----->>>---->>> 获取recovery.done md5值: " md5sum $PGDATA/recovery.done echo "建议: " echo " 主备md5值一致(判断主备配置文件是否内容一致的一种手段, 或者使用diff)." echo -e "\n" echo "----->>>---->>> 获取recovery.done配置: " grep '^\ *[a-z]' $PGDATA/recovery.done|awk -F "#" '{print $1}' echo "建议: " echo " 在primary_conninfo中不要配置密码, 容易泄露. 建议为流复制用户创建replication角色的用户, 并且配置pg_hba.conf只允许需要的来源IP连接. " echo -e "\n" } # primary function end standby() { echo "----->>>---->>> 获取recovery.conf md5值: " md5sum $PGDATA/recovery.conf echo "建议: " echo " 主备md5值一致(判断主备配置文件是否内容一致的一种手段, 或者使用diff)." echo -e "\n" echo "----->>>---->>> 获取recovery.conf配置: " grep '^\ *[a-z]' $PGDATA/recovery.conf|awk -F "#" '{print $1}' echo "建议: " echo " 在primary_conninfo中不要配置密码, 容易泄露. 建议为流复制用户创建replication角色的用户, 并且配置pg_hba.conf只允许需要的来源IP连接. " echo -e "\n" } # standby function end adds() { echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|" echo "| 附加信息 |" echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|" echo "" echo "----->>>---->>> 附件1 : `date -d '-1 day' +"%Y-%m-%d"` 操作系统sysstat收集的统计信息 " sar -A -f /var/log/sa/sa`date -d '-1 day' +%d` echo -e "\n" echo "----->>>---->>> 其他建议: " echo " 其他建议的巡检项: " echo " HA 状态是否正常, 例如检查HA程序, 检查心跳表的延迟. " echo " sar io, load, ...... " echo " 巡检结束后, 清理csv日志 " } # adds function end if [ $is_standby == 't' ]; then standby else primary fi common adds cd $pwd return 0 # 备注 # csv日志分析需要优化 # 某些操作需要root