#!/usr/bin/env bash # apache-log-stats.bash (https://github.com/iliaross/script-stash) # Copyright Ilia Ross # Licensed under the MIT License # # This script parses Apache access and error logs to extract useful statistics. # # Description: # Streaming parser for Apache access and error logs. Extracts top IPs, URLs, # status codes, methods, 404s, 5xx errors, busiest hours, bandwidth stats. # Error logs show levels, modules, client IPs, AH codes, and recent critical # errors. Supports rotated .gz/.tar.gz files. Optional GeoIP and progress meter. # # Usage: # ./apache-log-stats.bash -i [options] # ./apache-log-stats.bash -i [-i ...] [options] # ./apache-log-stats.bash -i -R [n] # ./apache-log-stats.bash -i --user-agents --progress # # Examples: # # Parse access log (auto-detected) # ./apache-log-stats.bash -i /var/log/apache2/access.log # ./apache-log-stats.bash -i site1_access_log -i site2_access_log # # # Parse with all rotated logs included # ./apache-log-stats.bash -i /var/log/virtualmin/site_access_log -R # # # Include only newest 3 rotated logs # ./apache-log-stats.bash -i /var/log/virtualmin/site_access_log -R 3 # # # Parse error log explicitly # ./apache-log-stats.bash -i /var/log/apache2/error.log -t error # # # Disable GeoIP and colors (for scripting) # ./apache-log-stats.bash -i /var/log/apache2/access.log --no-geoip --no-color # # Exit codes: # 0 success # 1 error (usage, missing deps, file not found) # 2 no lines parsed (empty log stream) set -euo pipefail umask 077 shopt -s nullglob # Color output on/off toggle set by init_color USE_COLOR=0 # Enable colors only when stdout is a TTY and NO_COLOR is not set init_color() { if [ -t 1 ] && [ -z "${NO_COLOR:-}" ]; then USE_COLOR=1 fi } # Print a string in a given color or plain if colors are off color() { local c="$1" s="$2" if [ "$USE_COLOR" -eq 0 ]; then printf '%s' "$s" return fi local code="" case "$c" in reset) code=0 ;; bold) code=1 ;; dim) code=2 ;; red) code=91 ;; green) code=92 ;; yellow) code=93 ;; blue) code=94 ;; magenta) code=95 ;; cyan) code=96 ;; gray) code=90 ;; *) printf '%s' "$s"; return ;; esac printf '\033[%sm%s\033[0m' "$code" "$s" } # Print a boxed section header section() { local title="$1" local bar bar=$(printf '%*s' $(( ${#title} + 4 )) '' | tr ' ' '-') printf '%s\n%s\n%s\n' \ "$(color gray "$bar")" \ "$(color bold "| $title |")" \ "$(color gray "$bar")" } # Print a simple subsection header subsection() { local title="$1" printf '\n%s\n' "$(color yellow "$title")" printf '%s\n' \ "$(color gray "$(printf '%*s' ${#title} '' | tr ' ' '-')")" } # Right-pad to a fixed width, then apply color (keeps columns aligned) pad_color() { local width="$1" col="$2" s="$3" local padded padded=$(printf "%*s" "$width" "$s") color "$col" "$padded" } # Left-pad to a fixed width, then apply color (keeps columns aligned) pad_color_left() { local width="$1" col="$2" s="$3" local padded padded=$(printf "%-*s" "$width" "$s") color "$col" "$padded" } # Require a command to exist, else exit with a message need_cmd() { local c="$1" if ! command -v "$c" &>/dev/null; then printf '%s\n' "$(color red "Error: missing command: $c")" >&2 exit 1 fi } # Return success if a command exists have_cmd() { command -v "$1" &>/dev/null } # Detect GNU stat vs BSD stat (different flags) is_gnu_stat() { stat --version &>/dev/null } # Get file size in bytes, portable across GNU/BSD stat file_size() { local file="$1" if is_gnu_stat; then stat -c%s "$file" 2>/dev/null else stat -f%z "$file" 2>/dev/null fi } # Best-effort uncompressed size for compressed files, without full # decompression; returns empty if unknown file_uncompressed_size() { local f="$1" # .gz (also works for gzip-compressed tar archives like .tar.gz and .tgz) if [[ "$f" =~ \.gz$|\.tgz$ ]]; then local csz isz csz=$(file_size "$f" 2>/dev/null || printf '0') isz=$( tail -c 4 "$f" 2>/dev/null \ | od -An -tu4 -N4 2>/dev/null \ | awk '{print $1}' ) # gzip footer size wraps at 2^32, so "un-wrap" if it looks # wrapped; for logs, uncompressed should not be smaller than # compressed if [ -n "$isz" ] && [ "$isz" -gt 0 ] && [ "$csz" -gt 0 ] \ && [ "$isz" -lt "$csz" ] then local wrap=4294967296 local i=0 while [ "$isz" -lt "$csz" ] && [ "$i" -lt 16 ]; do isz=$(( isz + wrap )) i=$(( i + 1 )) done fi printf '%s' "${isz:-}" return fi # .xz / .txz: use machine-readable output, grab "totals" uncompressed bytes if [[ "$f" =~ \.xz$|\.txz$ ]] && have_cmd xz; then xz --robot -l "$f" 2>/dev/null \ | awk -F'\t' '$1=="totals" {print $5; exit}' return fi # zip: unzip -l summary line if [[ "$f" =~ \.zip$ ]] && have_cmd unzip; then unzip -l "$f" 2>/dev/null | awk ' { v=$1; gsub(/,/, "", v) if (v ~ /^[0-9]+$/ && ($NF=="file" || $NF=="files")) { print v; exit } } ' return fi printf '' } # Get file mtime as a readable timestamp, portable across GNU/BSD stat file_mtime() { local file="$1" if is_gnu_stat; then stat -c "%y" "$file" 2>/dev/null | cut -d'.' -f1 else stat -f "%Sm" -t "%Y-%m-%d %H:%M:%S" "$file" 2>/dev/null fi } # Get file mtime as epoch seconds (used for sorting rotated logs) file_mtime_epoch() { local file="$1" if is_gnu_stat; then stat -c "%Y" "$file" 2>/dev/null || printf '0' else stat -f "%m" "$file" 2>/dev/null || printf '0' fi } # Human-readable size for file sizes (prefers numfmt) human_bytes() { local n="$1" if have_cmd numfmt; then numfmt --to=si --format="%.1f" "$n" 2>/dev/null \ | sed 's/\.0\([a-zA-Z]\)/\1/' return fi printf '%s' "$n" } # Human-readable count (prefers numfmt) human_count() { local n="$1" if have_cmd numfmt; then numfmt --to=si --format="%.1f" "$n" 2>/dev/null \ | sed 's/\.0\([a-zA-Z]\)/\1/' \ | tr 'K' 'k' return fi if [ "$n" -lt 1000 ]; then printf '%s' "$n" elif [ "$n" -lt 1000000 ]; then printf '%sk' $(( (n + 500) / 1000 )) elif [ "$n" -lt 1000000000 ]; then printf '%sM' $(( (n + 500000) / 1000000 )) else printf '%sG' $(( (n + 500000000) / 1000000000 )) fi } # Remove a temporary directory (used with trap) cleanup_tmp() { local d="$1" [ -n "$d" ] && [ -d "$d" ] && rm -rf "$d" } # Print help and exit usage() { cat >&2 <<-EOF Usage: $(basename "$0") -i [-i ...] [options] Options: -i, --input Log file to parse (repeatable; preserves order) -t, --type access | error | auto (default: auto) -n, --number Number of rows to show (default: 25) -R, --rotated [n] Include rotated siblings per input: -R include all rotated logs -R n include newest n rotated logs --user-agents Show user-agent stats (slower; default: off) --progress Show progress meter (default: auto) --no-geoip Disable GeoIP output --no-color Disable colors -h, --help Show help Exit codes: 0 success 2 no lines parsed 1 error EOF exit 1 } # Guess log type from filename guess_type_from_name() { local f="$1" case "$f" in *_access_log*|*access.log*|*access_log*) printf 'access' ;; *_error_log*|*error.log*|*error_log*) printf 'error' ;; *) printf '' ;; esac } # Guess log type by sniffing the first line (works for plain/gz/tar.gz) guess_type_from_content() { local f="$1" local line="" if [[ "$f" =~ \.tar\.gz$|\.tgz$ ]]; then line=$(tar -xOzf "$f" 2>/dev/null | head -n1 || true) elif [[ "$f" =~ \.tar\.bz2$|\.tbz2$ ]]; then line=$(tar -xOjf "$f" 2>/dev/null | head -n1 || true) elif [[ "$f" =~ \.tar\.xz$|\.txz$ ]]; then line=$(tar -xOJf "$f" 2>/dev/null | head -n1 || true) elif [[ "$f" =~ \.tar\.zst$|\.tzst$ ]] && tar --help 2>/dev/null | grep -q -- '--zstd'; then line=$(tar --zstd -xOf "$f" 2>/dev/null | head -n1 || true) elif [[ "$f" =~ \.gz$ ]]; then line=$(gzip -cd "$f" 2>/dev/null | head -n1 || true) elif [[ "$f" =~ \.bz2$ ]] && have_cmd bzip2; then line=$(bzip2 -dc "$f" 2>/dev/null | head -n1 || true) elif [[ "$f" =~ \.xz$ ]] && have_cmd xz; then line=$(xz -dc "$f" 2>/dev/null | head -n1 || true) elif [[ "$f" =~ \.zst$ ]] && have_cmd zstd; then line=$(zstd -dc "$f" 2>/dev/null | head -n1 || true) elif [[ "$f" =~ \.zip$ ]] && have_cmd unzip; then line=$(unzip -p "$f" 2>/dev/null | head -n1 || true) else line=$(head -n1 "$f" 2>/dev/null || true) fi if [[ "$line" == *\"GET\ *HTTP/*\"* ]] \ || [[ "$line" == *\"POST\ *HTTP/*\"* ]] \ || [[ "$line" == *\"HEAD\ *HTTP/*\"* ]]; then printf 'access' return fi if [[ "$line" == \[*\]\ \[*\]* ]]; then printf 'error' return fi printf '' } # Collect a main log plus rotated siblings, newest-first selection by mtime. collect_files_for_input() { local input="$1" local rotated_depth="$2" local files=() files+=("$input") if [ "$rotated_depth" -eq 0 ]; then printf '%s\n' "${files[@]}" return fi local dir base f dir=$(dirname "$input") base=$(basename "$input") local -a rotated=() for f in \ "$dir/$base"-*.gz \ "$dir/$base"-*.bz2 \ "$dir/$base"-*.xz \ "$dir/$base"-*.zst \ "$dir/$base"-*.zip \ "$dir/$base"-*.tar.gz \ "$dir/$base"-*.tgz \ "$dir/$base"-*.tar.bz2 \ "$dir/$base"-*.tbz2 \ "$dir/$base"-*.tar.xz \ "$dir/$base"-*.txz \ "$dir/$base"-*.tar.zst \ "$dir/$base"-*.tzst \ "$dir/$base".[0-9]* do [ -e "$f" ] && rotated+=("$f") done if [ "${#rotated[@]}" -eq 0 ]; then printf '%s\n' "${files[@]}" return fi if [ "$rotated_depth" -eq -1 ]; then while IFS= read -r f; do files+=("$f") done < <( for f in "${rotated[@]}"; do printf '%s\t%s\n' \ "$(file_mtime_epoch "$f")" "$f" done | sort -rn | awk -F'\t' '{print $2}' ) else while IFS= read -r f; do files+=("$f") done < <( for f in "${rotated[@]}"; do printf '%s\t%s\n' \ "$(file_mtime_epoch "$f")" "$f" done | sort -rn | head -n "$rotated_depth" \ | awk -F'\t' '{print $2}' ) fi printf '%s\n' "${files[@]}" } # Stream a file to stdout (plain/gz/tar.gz), without writing temp files; uses # pigz if available for faster multi-core gzip decompression stream_file() { local f="$1" # tar archives if [[ "$f" =~ \.tar\.gz$|\.tgz$ ]]; then if have_cmd pigz; then tar --use-compress-program=pigz -xOf "$f" 2>/dev/null else tar -xOzf "$f" 2>/dev/null fi return fi if [[ "$f" =~ \.tar\.bz2$|\.tbz2$ ]]; then tar -xOjf "$f" 2>/dev/null return fi if [[ "$f" =~ \.tar\.xz$|\.txz$ ]]; then tar -xOJf "$f" 2>/dev/null return fi if [[ "$f" =~ \.tar\.zst$|\.tzst$ ]]; then if tar --help 2>/dev/null | grep -q -- '--zstd'; then tar --zstd -xOf "$f" 2>/dev/null elif have_cmd zstd; then zstd -dc "$f" 2>/dev/null | tar -xOf - 2>/dev/null fi return fi # single-file compression if [[ "$f" =~ \.gz$ ]]; then if have_cmd pigz; then pigz -dc "$f" 2>/dev/null else gzip -cd "$f" 2>/dev/null fi return fi if [[ "$f" =~ \.bz2$ ]]; then bzip2 -dc "$f" 2>/dev/null return fi if [[ "$f" =~ \.xz$ ]]; then xz -dc "$f" 2>/dev/null return fi if [[ "$f" =~ \.zst$ ]]; then zstd -dc "$f" 2>/dev/null return fi if [[ "$f" =~ \.zip$ ]]; then unzip -p "$f" 2>/dev/null return fi # plain (including .1 from delaycompress) cat "$f" 2>/dev/null } # Print a list of input logs with sizes and mtimes; plain logs show a single # size on disk, whereas compressed logs show uncompressed size on the left only # if known, and on-disk size in brackets on the right print_file_list() { local -a files=("$@") local total_disk=0 local total_un=0 local un_partial=0 local saw_comp=0 local f sz usz shown left right section "Input files" for f in "${files[@]}"; do shown=$(basename "$f") if [ ! -f "$f" ]; then printf " %s %s\n" \ "$(color yellow "(missing)")" \ "$(color dim "$shown")" continue fi sz=$(file_size "$f" || printf '0') total_disk=$(( total_disk + sz )) right="" # Known compressed extensions; everything else is treated as plain if [[ "$f" =~ \.(gz|bz2|xz|zst|zip|tgz|tbz2|txz|tzst)$ ]] \ || [[ "$f" =~ \.tar\.(gz|bz2|xz|zst)$ ]] then saw_comp=1 usz=$(file_uncompressed_size "$f" 2>/dev/null || true) # Only show uncompressed size if it is a positive integer left="" if [[ "${usz:-}" =~ ^[0-9]+$ ]] && [ "$usz" -gt 0 ]; then left=$(human_bytes "$usz") total_un=$(( total_un + usz )) else un_partial=1 fi right="[$(human_bytes "$sz")]" printf " %s %s %s %s\n" \ "$(pad_color 8 cyan "$left")" \ "$(color dim "$(file_mtime "$f")")" \ "$(color dim "$shown")" \ "$(color dim "$right")" else # Plain file: uncompressed == on-disk left=$(human_bytes "$sz") total_un=$(( total_un + sz )) printf " %s %s %s\n" \ "$(pad_color 8 cyan "$left")" \ "$(color dim "$(file_mtime "$f")")" \ "$(color dim "$shown")" fi done # Totals: always show on-disk, optionally show raw total in brackets local disk_txt raw_txt disk_txt=$(human_bytes "$total_disk") if [ "$saw_comp" -eq 1 ] && [ "$total_un" -gt 0 ]; then raw_txt=$(human_bytes "$total_un") if [ "$un_partial" -eq 1 ]; then printf "\nTotal: %s [%s] (partial)\n\n" \ "$(color green "$disk_txt")" \ "$(color dim "$raw_txt")" else printf "\nTotal: %s [%s]\n\n" \ "$(color green "$disk_txt")" \ "$(color dim "$raw_txt")" fi else printf "\nTotal: %s\n\n" "$(color green "$disk_txt")" fi printf "\n" } # Translate @@SECTION@@ / @@SUB@@ markers into nice output format_marked_output() { while IFS= read -r line; do case "$line" in '@@SECTION@@ '*) section "${line#@@SECTION@@ }" ;; '@@SUB@@ '*) subsection "${line#@@SUB@@ }" ;; *) printf '%s\n' "$line" ;; esac done } # Optional progress meter for large logs; prefers pv (pretty), else uses proper # gawk counter progress_wrap() { # pv is fast and already shows lines, time, rate, and a bar if have_cmd pv; then pv -l -N lines -i 1 -B 4m # Move the cursor to a fresh line before report output starts printf '\n' >&2 return fi # Fallback progress meter implemented in gawk # Prints one update per second and streams all input unchanged gawk ' # Format numbers like 869, 10.2k, 2.2M function human(n, s) { if (n < 1000) return n "" if (n < 1000000) s = sprintf("%.1fk", n/1000.0) else if (n < 1000000000) s = sprintf("%.1fM", n/1000000.0) else s = sprintf("%.1fG", n/1000000000.0) if (s ~ /\.0[kMG]$/) sub(/\.0/, "", s) return s } # Format seconds as m:ss or h:mm:ss function hms(sec, h,m,s) { h = int(sec / 3600) m = int((sec % 3600) / 60) s = int(sec % 60) if (h > 0) return sprintf("%d:%02d:%02d", h, m, s) return sprintf("%d:%02d", m, s) } BEGIN { # Track start time and last update time start = systime() last = start lastc = 0 } { # Pass through input unchanged c++ print # Update once per second (keeps overhead low) now = systime() if (now != last) { dt = now - last if (dt < 1) dt = 1 # Rate over the last second window rate = (c - lastc) / dt el = now - start # \r rewrites the same terminal line printf "\rLines: %-8s Time: %-6s Rate: %-8s", \ human(c), hms(el), human(rate) "/s" > "/dev/stderr" last = now lastc = c } } END { # Final summary line plus an extra blank line now = systime() el = now - start if (el < 1) el = 1 rate = c / el printf "\rLines: %-8s Time: %-6s Rate: %-8s\n\n", \ human(c), hms(el), human(rate) "/s" > "/dev/stderr" # Make sure stderr is flushed before the report starts fflush("/dev/stderr") } ' } # Parse Apache access logs and emit report text with @@ markers; writes a meta # file with top IPs (countip) for GeoIP lookup. run_access_stats() { local top_n="$1" local meta_file="$2" local with_uas="$3" need_cmd gawk # Use the C locale for faster regex and character classes LC_ALL=C gawk \ -v TOP="$top_n" \ -v META="$meta_file" \ -v WITH_UA="$with_uas" ' function trim_dot0(s) { if (s ~ /\.0[kMGTP]$/) sub(/\.0/, "", s) return s } function hn(n, s) { if (n < 1000) return n "" if (n < 1000000) s = sprintf("%.1fk", n/1000.0) else if (n < 1000000000) s = sprintf("%.1fM", n/1000000.0) else if (n < 1000000000000) s = sprintf("%.1fG", n/1000000000.0) else if (n < 1000000000000000) s = sprintf("%.1fT", n/1000000000000.0) else s = sprintf("%.1fP", n/1000000000000000.0) return trim_dot0(s) } function hb(n, s) { if (n < 1000) return n "" if (n < 1000000) s = sprintf("%.1fk", n/1000.0) else if (n < 1000000000) s = sprintf("%.1fM", n/1000000.0) else if (n < 1000000000000) s = sprintf("%.1fG", n/1000000000.0) else if (n < 1000000000000000) s = sprintf("%.1fT", n/1000000000000.0) else s = sprintf("%.1fP", n/1000000000000000.0) return trim_dot0(s) } function mon_num(m) { return (m=="Jan")?1:(m=="Feb")?2:(m=="Mar")?3:(m=="Apr")?4: (m=="May")?5:(m=="Jun")?6:(m=="Jul")?7:(m=="Aug")?8: (m=="Sep")?9:(m=="Oct")?10:(m=="Nov")?11:(m=="Dec")?12:0 } function epoch_from_timepair(t0, tz0, tt,d,monS,y,h,mi,s,mon,rest) { tt = t0 " " tz0 gsub(/^\[/, "", tt) gsub(/\]$/, "", tt) split(tt, rest, ":") split(rest[1], dmy, "/") d = dmy[1]; monS = dmy[2]; y = dmy[3] h = rest[2]; mi = rest[3]; s = rest[4] sub(/[^0-9].*$/, "", s) mon = mon_num(monS) if (!mon) return 0 return mktime(sprintf("%d %d %d %d %d %d", y, mon, d, h, mi, s)) } function emit_top_ips(arr, n, k,c,i) { i = 0 PROCINFO["sorted_in"] = "@val_num_desc" for (k in arr) { c = arr[k] printf "%d\t%s\n", c, k >> META if (++i >= n) break } delete PROCINFO["sorted_in"] } function print_table(title, arr, n, k,c,i) { printf "@@SUB@@ %s\n", title i = 0 PROCINFO["sorted_in"] = "@val_num_desc" for (k in arr) { c = arr[k] printf " %8s %s\n", hn(c), k if (++i >= n) break } delete PROCINFO["sorted_in"] } function print_table_size(title, cnt, bytes, n, k,c,i,b,lbl) { printf "@@SUB@@ %s\n", title i = 0 PROCINFO["sorted_in"] = "@val_num_desc" for (k in cnt) { c = cnt[k] b = bytes[k] + 0 lbl = (b == 0 ? "redirected" : hb(b)) printf " %8s %s [%s]\n", hn(c), k, lbl if (++i >= n) break } delete PROCINFO["sorted_in"] } function print_table_size_queryonly(title, cnt, bytes, n, k,c,i,b,lbl) { printf "@@SUB@@ %s\n", title i = 0 PROCINFO["sorted_in"] = "@val_num_desc" for (k in cnt) { if (index(k, "?") == 0) continue c = cnt[k] b = bytes[k] + 0 lbl = (b == 0 ? "redirected" : hb(b)) printf " %8s %s [%s]\n", hn(c), k, lbl if (++i >= n) break } delete PROCINFO["sorted_in"] } function print_table_by_size(title, cnt, bytes, n, k,i,b,lbl) { printf "@@SUB@@ %s\n", title i = 0 PROCINFO["sorted_in"] = "@val_num_desc" for (k in bytes) { b = bytes[k] + 0 lbl = (b == 0 ? "redirected" : hb(b)) printf " %8s %s (%s)\n", lbl, k, hn(cnt[k]) if (++i >= n) break } delete PROCINFO["sorted_in"] } { # Split on quotes to get request/referrer/ua cheaply # q[1]=pre, q[2]=request, q[3]=post, q[4]=ref, q[6]=ua total++ nq = split($0, q, "\"") # Tokenize pre-quote to find IP and timestamp fields. np = split(q[1], p, /[ \t]+/) ip = p[1] if (ip == "") ip = "-" # Locate the timestamp token by pattern (more robust than fixed positions) t0 = "" tz0 = "" for (i = 1; i <= np; i++) { if (p[i] ~ /^\[[0-9]{1,2}\/[A-Za-z]{3}\/[0-9]{4}:/) { t0 = p[i] if (i+1 <= np) tz0 = p[i+1] break } } ips[ip]++ # Track min/max timestamp using a cheap sortable key if (t0 != "" && tz0 != "") { tt = t0 gsub(/^\[/, "", tt) split(tt, a, ":") split(a[1], dmy, "/") dd = sprintf("%02d", dmy[1] + 0) mon = mon_num(dmy[2]) yy = dmy[3] + 0 hh = a[2] + 0 mi = a[3] + 0 ss = a[4] sub(/[^0-9].*$/, "", ss) ss = ss + 0 key = sprintf("%04d%02d%02d%02d%02d%02d", yy, mon, dd, hh, mi, ss) if (min_key == "" || key < min_key) { min_key = key min_ts = t0 " " tz0 } if (max_key == "" || key > max_key) { max_key = key max_ts = t0 " " tz0 } # Keep busiest hour stats hour = sprintf("%04d-%02d-%02d %02d", yy, mon, dd, hh) per_hour[hour]++ } # Parse request: METHOD URI PROTO req = q[2] split(req, r, /[ \t]+/) method = r[1] uri_full = r[2] if (method == "") method = "-" if (uri_full == "") uri_full = "-" methods[method]++ # Optionally strip query part to reduce URL cardinality uri = uri_full sub(/\?.*$/, "", uri) # Normalize leading slashes: //path -> /path sub(/^\/+/, "/", uri) sub(/^\/+/, "/", uri_full) # Normalize trailing slash (except for root "/") if (uri != "/") sub(/\/+$/, "", uri) if (uri_full != "/") sub(/\/+$/, "", uri_full) # Parse status and bytes from the post-request chunk status = "-" bytes = 0 if (match(q[3], /[[:space:]]*([0-9]{3})[[:space:]]+([0-9-]+)/, m)) { status = m[1] bytes = m[2] } statuses[status]++ if (status ~ /^[0-9][0-9][0-9]$/) { sclass = substr(status, 1, 1) "xx" classes[sclass]++ } if (bytes == "-" || bytes == "") bytes = 0 bytes += 0 total_bytes += bytes bytes_ip[ip] += bytes # Response size stats (ignore 0-sized to keep min/max sane) if (bytes > 0) { count_bytes++ if (min_bytes == 0 || bytes < min_bytes) min_bytes = bytes if (bytes > max_bytes) max_bytes = bytes } # URL stats uris[uri]++ url_b[uri] += bytes uris_full[uri_full]++ url_full_b[uri_full] += bytes if (status == "404") { nf[uri]++ nf_b[uri] += bytes } if (status ~ /^5/) sx[uri]++ # Referrers ref = q[4] if (ref != "" && ref != "-") refs[ref]++ # User agents if enabled if (WITH_UA == 1) { ua = q[6] if (ua != "" && ua != "-") uas[ua]++ } } END { printf "@@SECTION@@ Access log\n" printf "@@SUB@@ Summary\n" printf " Requests: %s\n", hn(total) uip=0; for (k in ips) uip++ uuri=0; for (k in uris) uuri++ printf " Unique IPs: %s\n", hn(uip) printf " Unique URLs: %s\n", hn(uuri) printf " Bytes sent: %s\n", hb(total_bytes) # Derive a simple req/sec estimate from min/max timestamps if (min_ts != "" && max_ts != "") { split(min_ts, m, " ") split(max_ts, x, " ") min_ep = epoch_from_timepair(m[1], m[2]) max_ep = epoch_from_timepair(x[1], x[2]) t1 = min_ts t2 = max_ts gsub(/^\[/, "", t1); gsub(/\]$/, "", t1) gsub(/^\[/, "", t2); gsub(/\]$/, "", t2) span = max_ep - min_ep if (span < 1) span = 1 rps = total / span printf " Time span: %s — %s\n", t1, t2 printf " Avg req/sec: %.2f\n", rps } # Response size if (total_bytes > 0 && count_bytes > 0) { printf "@@SUB@@ Response size\n" printf " Total: %s\n", hb(total_bytes) printf " Avg: %s\n", hb(int(total_bytes / count_bytes)) printf " Min: %s\n", hb(min_bytes) printf " Max: %s\n", hb(max_bytes) } print_table("Busiest hours", per_hour, (TOP < 12 ? TOP : 12)) print_table("Top IPs", ips, TOP) emit_top_ips(ips, TOP) print_table("Top IPs by bandwidth (bytes summed)", bytes_ip, TOP) print_table_size("Top URLs by count", uris, url_b, TOP) print_table_by_size("Top URLs by size", uris, url_b, TOP) print_table_size_queryonly("Top URLs with query", uris_full, url_full_b, TOP) print_table("Top status codes", statuses, TOP) printf "@@SUB@@ Status classes\n" PROCINFO["sorted_in"] = "@val_num_desc" for (k in classes) printf " %8s %s\n", hn(classes[k]), k delete PROCINFO["sorted_in"] print_table("Top methods", methods, TOP) print_table_size("Top 404 URLs", nf, nf_b, TOP) print_table("Top 5xx URLs", sx, TOP) # Top referrers (always) print_table("Top referrers", refs, TOP) # Top user-agents (only if enabled) if (WITH_UA == 1) print_table("Top user-agents", uas, TOP) # Unique counts (always printed last) if (WITH_UA == 1) printf "\n@@SUB@@ Unique referrers and user-agents\n" else printf "\n@@SUB@@ Unique referrers\n" uref=0; for (k in refs) uref++ printf " Unique referrers: %s\n", hn(uref) if (WITH_UA == 1) { uua=0; for (k in uas) uua++ printf " Unique user-agents: %s\n", hn(uua) } } ' } # Parse Apache error logs and emit report text with @@ markers run_error_stats() { local top_n="$1" need_cmd gawk LC_ALL=C gawk -v TOP="$top_n" ' function trim_dot0(s) { if (s ~ /\.0[kMGTP]$/) sub(/\.0/, "", s) return s } function hn(n, s) { if (n < 1000) return n "" if (n < 1000000) s = sprintf("%.1fk", n/1000.0) else if (n < 1000000000) s = sprintf("%.1fM", n/1000000.0) else if (n < 1000000000000) s = sprintf("%.1fG", n/1000000000.0) else if (n < 1000000000000000) s = sprintf("%.1fT", n/1000000000000.0) else s = sprintf("%.1fP", n/1000000000000000.0) return trim_dot0(s) } function mon_num(m) { return (m=="Jan")?1:(m=="Feb")?2:(m=="Mar")?3:(m=="Apr")?4: (m=="May")?5:(m=="Jun")?6:(m=="Jul")?7:(m=="Aug")?8: (m=="Sep")?9:(m=="Oct")?10:(m=="Nov")?11:(m=="Dec")?12:0 } function epoch_from_error(line, mon,dd,hh,mi,ss,y) { if (match(line, /^\[[A-Za-z]{3} ([A-Za-z]{3}) ([0-9]{1,2}) ([0-9]{2}):([0-9]{2}):([0-9]{2})\.[0-9]+ ([0-9]{4})\]/, a)) { mon = mon_num(a[1]); dd = a[2] hh = a[3]; mi = a[4]; ss = a[5]; y = a[6] return mktime(sprintf("%d %d %d %d %d %d", y, mon, dd, hh, mi, ss)) } if (match(line, /^\[[A-Za-z]{3} ([A-Za-z]{3}) ([0-9]{1,2}) ([0-9]{2}):([0-9]{2}):([0-9]{2}) ([0-9]{4})\]/, b)) { mon = mon_num(b[1]); dd = b[2] hh = b[3]; mi = b[4]; ss = b[5]; y = b[6] return mktime(sprintf("%d %d %d %d %d %d", y, mon, dd, hh, mi, ss)) } return 0 } function print_table(title, arr, n, k,c,i) { printf "@@SUB@@ %s\n", title i = 0 PROCINFO["sorted_in"] = "@val_num_desc" for (k in arr) { c = arr[k] printf " %8s %s\n", hn(c), k if (++i >= n) break } delete PROCINFO["sorted_in"] } { total++ # Track min/max timestamps for time span ep = epoch_from_error($0) if (ep > 0) { if (min_ep == 0 || ep < min_ep) { min_ep = ep; min_t = substr($0, 2, index($0, "]")-2) } if (max_ep == 0 || ep > max_ep) { max_ep = ep; max_t = substr($0, 2, index($0, "]")-2) } } # Capture [level] and [module:level] forms if (match($0, /\] \[([^:\]]+)(:([^\]]+))?\]/, m)) { if (m[3] != "") { mod = m[1]; lvl = m[3] } else { mod = "(none)"; lvl = m[1] } modules[mod]++ levels[lvl]++ } # Extract client IP and AH codes (if present) if (match($0, /\[client ([0-9A-Fa-f:\.]+)\b/, c)) clients[c[1]]++ if (match($0, /AH[0-9]{5}/, a)) ah[a[0]]++ # Create a short "message start" by stripping leading [..] blocks msg = $0 while (match(msg, /^\[[^]]+\] /)) sub(/^\[[^]]+\] /, "", msg) gsub(/[ \t]+/, " ", msg) if (length(msg) > 160) msg = substr(msg, 1, 160) "..." msgs[msg]++ } END { printf "@@SECTION@@ Error log\n" printf "@@SUB@@ Summary\n" printf " Lines: %s\n", hn(total) if (min_ep > 0 && max_ep > 0) printf " Time span: %s — %s\n", min_t, max_t print_table("Levels", levels, TOP) print_table("Modules", modules, TOP) print_table("Client IPs", clients, TOP) print_table("AH codes", ah, TOP) print_table("Common message starts", msgs, TOP) } ' } # Find a GeoIP2 country database file (.mmdb), or return empty geoip2_db_file() { if [ -n "${GEOIP_DB:-}" ] && [ -r "$GEOIP_DB" ]; then printf '%s' "$GEOIP_DB" return fi # Common path on EL systems local f="/usr/share/GeoIP/GeoLite2-Country.mmdb" if [ -r "$f" ]; then printf '%s' "$f" return fi for f in \ /usr/share/geoip/*.mmdb \ /usr/share/GeoIP/*.mmdb \ /var/lib/GeoIP/*.mmdb do if [ -r "$f" ] && [[ "$f" == *Country*.mmdb ]]; then printf '%s' "$f" return fi done printf '' } # Extract the first quoted string from mmdblookup output mmdb_first_string() { awk -F'"' 'NF>=2 {print $2; exit}' } # GeoIP2 lookup via mmdblookup (returns "CCName") geoip2_lookup_ip() { local ip="$1" local db db=$(geoip2_db_file) if [ -z "$db" ] || ! have_cmd mmdblookup; then printf '??\tUnknown' return fi local cc name cc=$(mmdblookup --file "$db" --ip "$ip" country iso_code \ 2>/dev/null | mmdb_first_string || true) name=$(mmdblookup --file "$db" --ip "$ip" country names en \ 2>/dev/null | mmdb_first_string || true) [ -z "$cc" ] && cc="??" [ -z "$name" ] && name="Unknown" printf '%s\t%s' "$cc" "$name" } # Parse legacy geoiplookup output into "CCName" geoip_parse() { local out="$1" out=${out#*: } local cc="${out%%,*}" local name="${out#*,}" name="${name# }" [ -z "$cc" ] && cc="??" [ "$cc" = "--" ] && cc="??" [ -z "$name" ] && name="Unknown" [ "$name" = "N/A" ] && name="Unknown" printf '%s\t%s' "$cc" "$name" } # GeoIP lookup wrapper: prefer geoiplookup, fallback to GeoIP2 mmdblookup geoip_lookup_ip() { local ip="$1" local out="" if [[ "$ip" == *:* ]] && have_cmd geoiplookup6; then out=$(geoiplookup6 "$ip" 2>/dev/null || true) elif have_cmd geoiplookup; then out=$(geoiplookup "$ip" 2>/dev/null || true) fi if [ -n "$out" ]; then geoip_parse "$out" return fi geoip2_lookup_ip "$ip" } # Print GeoIP sections for the top IP list (fast and useful) by reading the # countip from the meta file produced by run_access_stats print_geoip_sections() { local meta_file="$1" # Require either legacy GeoIP or GeoIP2 support if ! have_cmd geoiplookup && ! have_cmd geoiplookup6; then if ! have_cmd mmdblookup; then return fi if [ -z "$(geoip2_db_file)" ]; then return fi fi [ ! -s "$meta_file" ] && return declare -A GEO_CC declare -A GEO_NAME declare -A COUNTRY_SUM declare -A COUNTRY_NAME local count ip cc_name cc name local max_ip_len=0 # First pass to cache Geo results and measure max IP width while IFS=$'\t' read -r count ip; do [ -z "${count:-}" ] && continue [ -z "${ip:-}" ] && continue if [ "${#ip}" -gt "$max_ip_len" ]; then max_ip_len="${#ip}" fi if [ -n "${GEO_CC[$ip]+x}" ]; then continue fi cc_name=$(geoip_lookup_ip "$ip") cc="${cc_name%%$'\t'*}" name="${cc_name#*$'\t'}" GEO_CC["$ip"]="$cc" GEO_NAME["$ip"]="$name" done < "$meta_file" # Second pass to sum countries from the top IP list while IFS=$'\t' read -r count ip; do [ -z "${count:-}" ] && continue [ -z "${ip:-}" ] && continue cc="${GEO_CC[$ip]}" name="${GEO_NAME[$ip]}" COUNTRY_SUM["$cc"]=$(( ${COUNTRY_SUM["$cc"]:-0} + count )) COUNTRY_NAME["$cc"]="$name" done < "$meta_file" section "GeoIP (top IPs)" while IFS=$'\t' read -r count ip; do [ -z "${count:-}" ] && continue [ -z "${ip:-}" ] && continue cc="${GEO_CC[$ip]}" name="${GEO_NAME[$ip]}" printf " %s %s %s %s\n" \ "$(pad_color 8 green "$(human_count "$count")")" \ "$(pad_color_left "$max_ip_len" dim "$ip")" \ "$(pad_color 2 cyan "$cc")" \ "$(color dim "$name")" done < "$meta_file" printf "\n" section "GeoIP (top countries from top IPs)" { for cc in "${!COUNTRY_SUM[@]}"; do printf '%s\t%s\t%s\n' \ "${COUNTRY_SUM[$cc]}" "$cc" "${COUNTRY_NAME[$cc]}" done } | sort -rn | while IFS=$'\t' read -r sum cc name; do printf " %s %s %s\n" \ "$(pad_color 8 green "$(human_count "$sum")")" \ "$(pad_color 2 cyan "$cc")" \ "$(color dim "$name")" done printf "\n%s\n" \ "$(color dim "Note: GeoIP sums use only the top-IP list.")" } # Main entry to parse args, build file list, stream logs, print report main() { init_color local type="auto" local number=25 local rotated_depth=0 local show_progress=0 local geoip_disabled=0 local with_uas=0 local -a inputs=() # Parse CLI args. -i/--input can be repeated and keeps order while [ "$#" -gt 0 ]; do case "$1" in -i|--input) shift [ "${1:-}" = "" ] && usage inputs+=("$1") ;; -t|--type) shift [ "${1:-}" = "" ] && usage type="$1" ;; -n|--number) shift [ "${1:-}" = "" ] && usage number="$1" ;; -R|--rotated) if [ "${2:-}" != "" ] && [[ "${2:-}" =~ ^[0-9]+$ ]]; then rotated_depth="$2" shift else rotated_depth=-1 fi ;; --user-agents) with_uas=1 ;; --progress) show_progress=1 ;; --no-geoip) geoip_disabled=1 ;; --no-color) USE_COLOR=0 NO_COLOR=1 ;; -h|--help) usage ;; *) printf '%s\n' \ "$(color red "Error: unknown option: $1")" >&2 usage ;; esac shift done # Auto-enable progress only for interactive runs if [ "$show_progress" -eq 0 ] && [ -t 2 ] && [ -z "${NO_COLOR:-}" ]; then show_progress=1 fi [ "${#inputs[@]}" -eq 0 ] && usage need_cmd gzip need_cmd gawk # Validate inputs before doing any work. local in for in in "${inputs[@]}"; do if [ ! -f "$in" ]; then printf '%s\n' \ "$(color red "Error: file not found: $in")" >&2 exit 1 fi done # Auto-detect type from the first input if not forced if [ "$type" = "auto" ]; then local guessed="" guessed=$(guess_type_from_name "${inputs[0]}" || true) if [ "$guessed" = "" ]; then guessed=$(guess_type_from_content "${inputs[0]}" || true) fi type="${guessed:-access}" fi if [ "$type" != "access" ] && [ "$type" != "error" ]; then printf '%s\n' \ "$(color red "Error: bad type: $type")" >&2 exit 1 fi if ! [[ "$number" =~ ^[0-9]+$ ]] || [ "$number" -lt 1 ]; then printf '%s\n' \ "$(color red "Error: bad --number value: $number")" >&2 exit 1 fi # Build final file list in input order, then add rotated siblings per # input local -a files=() local detected for in in "${inputs[@]}"; do detected=$(guess_type_from_name "$in" || true) if [ "$detected" = "" ]; then detected=$(guess_type_from_content "$in" || true) fi if [ "$detected" != "" ] && [ "$detected" != "$type" ]; then printf '%s\n' "$(color red "Error: mixed log types")" >&2 printf '%s\n' \ "$(color dim "Tip: run access and error logs separately.")" \ >&2 exit 1 fi while IFS= read -r f; do files+=("$f") done < <(collect_files_for_input "$in" "$rotated_depth") done # tar is needed only if we actually see tarballs local f for f in "${files[@]}"; do if [[ "$f" =~ \.tar\.(gz|bz2|xz|zst)$|\.t(gz|bz2|xz|zst)$ ]]; then need_cmd tar break fi done # Require decompress tools only if we are going to read those formats for f in "${files[@]}"; do case "$f" in *.bz2|*.tbz2|*.tar.bz2) need_cmd bzip2 ;; *.xz|*.txz|*.tar.xz) need_cmd xz ;; *.zst|*.tzst|*.tar.zst) need_cmd zstd ;; *.zip) need_cmd unzip ;; esac done # Use a temp dir for small meta files (top IP list and line count) local tmp_dir tmp_dir=$(mktemp -d) trap 'cleanup_tmp "'"$tmp_dir"'"' EXIT local meta_top_ips="$tmp_dir/top_ips.tsv" local line_count_file="$tmp_dir/lines.txt" section "Apache log stats" printf "Type: %s\n" "$(color cyan "$type")" printf "Top entries: %s\n" "$(color cyan "$number")" printf "Inputs: %s\n" "$(color cyan "${#inputs[@]}")" if [ "$rotated_depth" -eq 0 ]; then printf "Rotated: %s\n" "$(color cyan "none")" elif [ "$rotated_depth" -eq -1 ]; then printf "Rotated: %s\n" "$(color cyan "all")" else printf "Rotated: %s\n" "$(color cyan "yes (newest $rotated_depth)")" fi local progress_desc="disabled" if [ "$show_progress" -eq 1 ]; then progress_desc="enabled" fi printf "Progress: %s\n" "$(color cyan "$progress_desc")" if [ "$type" = "access" ]; then local ua_desc="disabled" if [ "$with_uas" -eq 1 ]; then ua_desc="enabled" fi printf "User agents: %s\n" "$(color cyan "$ua_desc")" fi # GeoIP can work via geoiplookup or via mmdblookup+db local geoip_possible=0 if [ "$geoip_disabled" -eq 0 ] && ( have_cmd geoiplookup || have_cmd geoiplookup6 \ || ( have_cmd mmdblookup && [ -n "$(geoip2_db_file)" ] ) ); then geoip_possible=1 fi local geoip_desc="not available" if [ "$geoip_disabled" -eq 1 ]; then geoip_desc="disabled" elif [ "$geoip_possible" -eq 1 ]; then geoip_desc="enabled" fi printf "GeoIP: %s\n\n" "$(color cyan "$geoip_desc")" print_file_list "${files[@]}" section "Parsing" printf "%s\n\n" "$(color dim "Reading and analyzing logs...")" # Stream all selected files once and tee counts lines in parallel if [ "$type" = "access" ]; then ( for f in "${files[@]}"; do stream_file "$f" done ) | tee >(wc -l | tr -d " " > "$line_count_file") \ | { if [ "$show_progress" -eq 1 ]; then progress_wrap else cat fi } | run_access_stats "$number" "$meta_top_ips" "$with_uas" \ | format_marked_output else ( for f in "${files[@]}"; do stream_file "$f" done ) | tee >(wc -l | tr -d " " > "$line_count_file") \ | { if [ "$show_progress" -eq 1 ]; then progress_wrap else cat fi } | run_error_stats "$number" \ | format_marked_output fi # Use the tee-produced counter to detect empty streams local parsed_lines parsed_lines=$(cat "$line_count_file" 2>/dev/null || printf '0') if ! [[ "$parsed_lines" =~ ^[0-9]+$ ]]; then parsed_lines=0 fi if [ "$parsed_lines" -eq 0 ]; then printf '\n%s\n' \ "$(color yellow "No lines parsed (empty log stream).")" >&2 exit 2 fi printf "\n" section "Done" printf " Parsed lines: %s\n" \ "$(color green "$(human_count "$parsed_lines")")" printf "\n" # GeoIP uses the meta file written by run_access_stats. if [ "$type" = "access" ] && [ "$geoip_possible" -eq 1 ]; then print_geoip_sections "$meta_top_ips" fi printf "\n" } main "$@"