#!/usr/bin/perl -CAS # █ H Ascii histograms and bar charts. # █▃ I # ▂██ Input is a stream of numbers for histograms, uniq -c output for bar charts. # ███ y-axis scale (--height), resolution is N times this (default N=8). # ███ number of bins (--bins, default 80) # ███ S F Optional subselection of data (--min --max) # ▃███▇ Y I --cumulative # █████ S ▅█▄ T --histin if input is already histogram counts # █████ ▁▆███▁ quartiles indicated # ▄█████ ▁▆██████ default Unicode block glyphs u2581-u2588, --plain for retro-style. # ██████▅ ████████▃ # ███████ ▁██████████ bar charts with --tallyin if input is output of uniq (line format: ) # ▄███████▆███████████▆ # █████████████████████ # █████████████████████▇ # ▇██████████████████████▄ # ▁████████████████████████▂ # ██████████████████████████▆▃ # ▃████████████████████████████▇▇▅▄▄▅▄▂▃▃▄▃▃▃▄▄▄▄▄▅▄▃▄▃▂▁ # ▄████████████████████████████████████████████████████████▇▇▅▄▅▃▃▂▂▂▁▁▁▁▁ # 0.0 .____Q____._Q__,___Q.____,____.____,____.____,____.____,____.____,____._ 3.0 # ND=273410 oor=356/0.1% hmin=0 hmax=3 dmin=0.01409641 dmax=5.863222 # topbin(i=9,n=16880,p=6.2,x~0.31875) bw=0.0375 q=(0.3414933[9],0.620431[16],0.8757837[23]) # Thanks to Izaak van Dongen, Vlad Kisilev and Leopold Parts for feature suggestions. # - add mean/sdev? # - perhaps add optional output use Getopt::Long; use strict; use warnings; use Scalar::Util qw(looks_like_number); use List::Util qw(max min); use utf8; use POSIX qw(floor ceil); my $progname = 'hissyfit'; my @ARGV_COPY = @ARGV; my $n_args = @ARGV; my $help = 0; my $debug = 0; my $cumulative = 0; my $terse = 0; my $plain = 0; my $takelog = 0; my $emoji = 0; my $block = 0; my $annotpos = undef; my $histin = 0; # input data is already histogram data my $countsin = 0; # input data is numerical value plus count my $tallyin = 0; # input data is output of uniq my $annot = ""; my $height = 20; my $SUBLEVELS = 8; # capitals for constants maybe my $user_min = undef; my $user_max = undef; my $user_Ymax = undef; my $NBINS = 100; my $user_bins = 0; my $zilch = ' '; my $canvaslrh = undef; my $notches = ''; my $dquery = ''; my $staircase = '▁-▂-▃-▄-▅-▆-▇-█'; my $stonehenge = '▏-▎-▍-▌-▋-▊-▉-█'; my $stairsep = '-'; my $emosep = '🐙'; sub help { print < ) --terse do not print quantile and bin information --annot= output 'Description: STRING' in the legend below the histogram --stairs=x-y[-z]* use these as cursor sublevels (argument will split on '$stairsep' or --sep) --stair-sep=X use X as separator to split stairs (default $stairsep) --zilch=S use S as filler for histogram (default space) --ymax=NUM set maximum for y axis; ignored if too small Normal output is formed by vertical bars (regular histogram for numerical distributions). With --tallyin horizontal bars are output, preceded by labels, followed by counts. Legend explanation: ND=452683 Loor=1337/0.3% Roor=14429/3.1% hmin=-10 hmax=190 dmin=-7656 dmax=7701 topbin(i=4,n=41407,p=8.8,x~-3) bw=2 qr=(2 95 103) ND Number of Data Loor Left out of range count/percentage, if --min caused data to be filtered Roor Right out of range count/percentage, if --max caused data to be filtered hmin Minimum of data range as represented in histogram hmax Maximum of data range as represented in histogram bw Bin width: bw = (hmin - hmax) / nbins dmin Smallest data item dmax Largest data item topbin Highest bin. i: index, n: count, p: percentage, x: middle of bin qr The three data quartile values for full range of input data EOH } if (! GetOptions ( "help" => \$help , "plain" => \$plain , "block" => \$block , "debug" => \$debug , "cumulative" => \$cumulative , "height=i" => \$height , "bins=i" => \$user_bins , "min=f" => \$user_min , "max=f" => \$user_max , "ymax=f" => \$user_Ymax , "cv=s" => \$canvaslrh , "x=s" => \$notches , "d=s" => \$dquery , "stairs=s" => \$staircase , "stair-sep=s" => \$stairsep , "zilch=s" => \$zilch , "emoji" => \$emoji , "log2" => \$takelog , "histin" => \$histin , "countsin" => \$countsin , "tallyin" => \$tallyin , "terse" => \$terse , "annot=s" => \$annot , "annotpos=s" => \$annotpos , "emoji_" => \$emosep ) ) { print STDERR "option processing failed\n"; exit(1); } $NBINS = $user_bins if $user_bins; if ($help) { help(); exit(0); } die "Two many modes (--histin --tallyin)\n" if $tallyin + $histin >= 2; if (defined($canvaslrh)) { if ($canvaslrh =~ s|^(\S+?)/(\S+?)/(\S+)/(\S+)$|$&|) { ($user_min, $user_max, $NBINS, $height) = ($1, $2, $3, $4); } elsif ($canvaslrh =~ s|^(\S+?)/(\S+?)/(\S+)$|$&|) { ($user_min, $user_max, $NBINS) = ($1, $2, $3); } elsif ($canvaslrh =~ s|^(\S+?)/(\S+?)$|$&|) { ($user_min, $user_max) = ($1, $2); } } if ($annot && $annot =~ /(.*?)\/(tr|tl|br|bl)$/) { ($annot, $annotpos) = ($1, $2); } my @notches = (); my $notch_anchor = undef; my $notch_skip = undef; if ($notches =~ /(\S+?)%(\S+)/) { ($notch_anchor, $notch_skip) = ($1, $2); } else { @notches = sort { $a <=> $b } grep { looks_like_number($_) } split '/', $notches; } my @dquery = defined($dquery) ? sort { $a <=> $b } grep { looks_like_number($_) } split '/', $dquery : (); if ($plain) { $staircase = ".,-,:,',|"; $stairsep = ','; $staircase = '-,:,+,=' if $tallyin; } elsif ($emoji) { # say hi to flushed, woozy, monocle, grinning and sunglasses. $staircase = '😳-🥴-🧐-😀-😎'; $zilch = ' ' unless $zilch ne ' '; # Used to have 'anxious face with sweat' 😰 } elsif ($tallyin) { $staircase = $stonehenge; } die "Bins in [1,1000] please or edit code\n" unless $NBINS >= 1 && $NBINS <= 1000; my @D = (); my $n_warn = 0; while (<>) { chomp; my $d = $_; if (!$countsin && !$tallyin && !looks_like_number($d)) { print STDERR "Skipping non-numerical input [$d]\n" if $n_warn++ < 5; print STDERR "Further warnings suppressed\n" if $n_warn == 5; next; } if ($countsin) { $d =~ /^\s*(\S+)\s+(\d+)\s*$/ && looks_like_number($1) || die "Could not parse numeric/counts pair ($d)\n"; my ($val, $count) = ($1, $2); push @D, ( ($val) x $count ); } else { push @D, $_; } } if (!@D) { print "No data\n"; exit 0; } my @H; my ($dmin, $dmax, $hmin, $hmax, $q1, $q2, $q3, $out_of_range, $ND_orig, $ND_hist, $ND_left_cut, $ND_right_cut, $ND, $width); my @labels = (); if ($tallyin) { my @D2 = (); for my $d (@D) { $d =~ /(\d+)\s+(.*)/ || die "Could not parse tally pair ($d)\n"; push @labels, $2; push @D2, $1; } @D = @D2; @H = @D2; $NBINS = @H; $hmin = $dmin = 0; # this lot is not used in this mode $hmax = $NBINS; # but it's just as well to set them. $width = 0; # $ND = 0; $ND += $_ for @H; $ND_orig = $ND; $ND_hist = $ND; } elsif ($histin) { @H = @D; $hmin = $dmin = 0; $hmax = $dmax = 100; $hmin = $user_min if defined($user_min); $hmax = $user_max if defined($user_max); $width = ($hmax - $hmin) / $NBINS; $q1 = $q2 = $q3 = '-'; $out_of_range = 0; if ($user_bins) { @H = (); my $J = 0; $NBINS = min($NBINS, 0 + @D); for (my $i=1; $i<=$NBINS; $i++) { push @H, 0; my $endpoint = int((@D * $i) / ($NBINS)); for (my $j=$J; $j<$endpoint; $j++) { $H[-1] += $D[$j]; } $H[-1] = int($H[-1] / ($endpoint - $J)); $J = $endpoint; } $ND = @H; } else { $NBINS = @H; $ND = 0; $ND += $_ for @H; $ND_orig = $ND; } $ND_orig = $ND; $ND_hist = $ND; } else { # Nearly ready to stick this in a subroutine. @D = sort { $a <=> $b} @D; $dmin = $D[0]; $dmax = $D[-1]; $q1 = $D[int((@D-1)/4)]; # Not going to sweat ties or even off-by-1. $q2 = $D[int((@D-1)/2)]; # What they said. $q3 = $D[int(3*(@D-1)/4)]; # ^ $hmin = $dmin; $hmax = $dmax; $out_of_range = 0; if (defined($user_min)) { my $umin = $user_min; $out_of_range++ if $umin > $dmin; $hmin = $umin; } if (defined($user_max)) { my $umax = $user_max; $out_of_range++ if $umax < $dmax; $hmax = $umax; } $ND_orig = @D; $ND_left_cut = 0; $ND_right_cut = 0; if ($out_of_range) { for (@D) { $ND_left_cut++ if $_ < $hmin; $ND_right_cut++ if $_ >= $hmax } @D = grep { $_ >= $hmin && $_ < $hmax } @D; } $ND_hist = @D; if (!@D) { print "No data present in bounds [$hmin, $hmax)\n"; exit 0; } ## This is input done. ## Now compute histogram data. $width = ($hmax-$hmin) / $NBINS; @H = (); # histogram data my $N_run = 0; my $threshold_1 = $hmin; my $threshold_2 = $hmin + $width; my $index = 0; my $d = 0; my $N_assigned = 0; while ($d < @D && $index < $NBINS) { my $item = $D[$d]; if ($item >= $threshold_1 && $item < $threshold_2) { $N_run++; $d++; } elsif ($item >= $threshold_2) { $H[$index] = $N_run; $N_assigned += $N_run; $N_run = 0; $index++; $threshold_1 = $threshold_2; $threshold_2 = $hmin + ($index+1) * $width; } } if ($N_run) { die "run error\n" if $index == $NBINS; $H[$index++] = $N_run; $N_assigned += $N_run; } while ($index < $NBINS) { # can happen under various circumstances, e.g. user boundaries. $H[$index] = 0; $index++; } $H[$NBINS-1] += ($ND_hist-$N_assigned); # this controls for floating point addition causing boundary issues. my $lh = @H; die "hist error ($lh)\n" unless @H == $NBINS; local $" = ' '; print STDERR "hval(@H)\n" if $debug; } ## This binned the data in a rather painful way. ## Now cumulative data is easy. We need it always, e.g. for quartiles. ## tallyin: no harm, ignored. my @C = ($H[0]); # cumulative histogram data for (my $i=1;$i<$NBINS; $i++) { $C[$i] = $C[$i-1] + $H[$i]; } ## This is histogram data done. ## Now set some paint parameters, compute a few stats. my @canvas = (); my %map = (); my @stairs = split($stairsep, $staircase); $SUBLEVELS = @stairs; my $bigbrick = $stairs[$SUBLEVELS-1]; for (my $j=1;$j<@stairs;$j++) { $map{$j} = $stairs[$j-1]; } my $bigbin = ( sort { $b <=> $a } @H )[0]; my $bigbinid = (grep { $H[$_] == $bigbin } 0..($NBINS-1))[0] + 1; # 1-based for user. my $bigbinmid = $hmin + ($bigbinid-0.5)*$width; my $Ymax = $cumulative ? $ND_hist : $bigbin; if (defined($user_Ymax)) { if ($Ymax > $user_Ymax) { print STDERR "--ymax value $user_Ymax is lower than data ymax, ignoring\n"; } else { $Ymax = $user_Ymax; } } if ($takelog) { $Ymax = int(1 + log($Ymax)/log(2)); } ## Parameters set. ## Paint the canvas, transposed. my $NUNIT = $height * $SUBLEVELS; # this variable needs capitals. # local $" = ' '; # print STDERR "-- (@D) (@H) (@C) $Ymax\n"; for (my $i=0;$i<$NBINS;$i++) { my $n = $cumulative ? $C[$i] : $H[$i]; if ($takelog && $n > 0) { $n = int(1 + log($n)/log(2)); } my $nunit = int($NUNIT*$n/$Ymax); # support maxinum of $NUNIT steps. my $Nrem = $nunit % $SUBLEVELS; # one char represents $SUBLEVELS $NUNITs. my $Nfull = ($nunit - $Nrem) / $SUBLEVELS; die "sum error\n" if $Nfull + ($Nrem > 0) > $height; my $Nzilch = $height - $Nfull - ($Nrem > 0); $canvas[$i] = [ ($bigbrick) x $Nfull ]; push @{$canvas[$i]}, $map{$Nrem} if $Nrem; push @{$canvas[$i]}, ($zilch) x $Nzilch; die "canvas paint error\n" unless @{$canvas[$i]} == $height; } ## Canvas done. ## Give it to the nice people. if ($tallyin) { my $n_trunc = 0; for (my $v=0; $v<$NBINS;$v++) { my $plabel = substr($labels[$v], 0, 20); my ($lb, $rb) = ('[', ']'); if (length($plabel) < length($labels[$v])) { ($lb, $rb) = ('(', ')'); $n_trunc++; } printf "%22s ", ( $lb . substr($labels[$v], 0, 20) . $rb ); for (my $u=0; $u<$height; $u++) { print $canvas[$v][$u]; } printf " %12d\n", $H[$v]; } my $bigbinpct = sprintf("%.1f", 100 * $bigbin / $ND_hist); print "total $ND_hist items, most frequent [$labels[$bigbinid-1]] (p=$bigbinpct)\n"; print "$n_trunc labels were truncated\n" if $n_trunc; exit(0); } if (defined($annotpos)) { print ' ' x ($NBINS - length($annot)), $annot, "\n" if $annotpos =~ /^tr$/i; print $annot, ' ' x ($NBINS - length($annot)), "\n" if $annotpos =~ /^tl$/i; } for (my $u=0; $u<$height; $u++) { for (my $v=0; $v<$NBINS;$v++) { print $canvas[$v][$height-$u-1]; } print "\n"; } ## Canvas given. ## Print some stats, go to some lengths to plot quartile locations. my $sep = '____.____,' x int(($NBINS+10)/10); $sep = substr($sep, 0, $NBINS); if ($NBINS >= 80) { my $sep_a = sprintf("%.1f ", $hmin); my $sep_z = sprintf(" %.1f", $hmax); substr($sep, 0, length($sep_a)) = $sep_a; substr($sep, length($sep) - length($sep_z), length($sep_z)) = $sep_z; } $sep = $emosep x $NBINS if $emoji; my $emit_quantiles = 1; # $histin || ($q1 > $hmin && $q3 < $hmax); # $p[123] are 1-based bin indexes. # my $p1 = 1 + (grep { $C[$_] >= 0.25 * $ND_hist } 0..($NBINS-1))[0]; my $p2 = 1 + (grep { $C[$_] >= 0.50 * $ND_hist } 0..($NBINS-1))[0]; my $p3 = 1 + (grep { $C[$_] >= 0.75 * $ND_hist } 0..($NBINS-1))[0]; # data was subselected, but for now we produce quantiles for # full input data always. if (!$histin && $emit_quantiles && $ND_hist != $ND_orig) { $p1 = $q1 > $hmin && $q1 < $hmax ? 1 + (grep { $C[$_] > 0 && $D[$C[$_]-1] >= $q1 } 0..($NBINS-1))[0] : 0; $p2 = $q2 > $hmin && $q2 < $hmax ? 1 + (grep { $C[$_] > 0 && $D[$C[$_]-1] >= $q2 } 0..($NBINS-1))[0] : 0; $p3 = $q3 > $hmin && $q3 < $hmax ? 1 + (grep { $C[$_] > 0 && $D[$C[$_]-1] >= $q3 } 0..($NBINS-1))[0] : 0; } print STDERR "c99=$C[$NBINS-1] nd=$ND_hist q=$q1,$q2,$q3 ql=$p1,$p2,$p3 qlv=$D[$C[$p1]-1],$D[$C[$p2]-1],$D[$C[$p3]-1]\n" if $debug; if ($emit_quantiles) { substr($sep, $p1-1, 1) = $emoji ? '🌘' : 'Q' if $p1; substr($sep, $p2-1, 1) = $emoji ? '🌗' : 'Q' if $p2; substr($sep, $p3-1, 1) = $emoji ? '🌖' : 'Q' if $p3; } print "$sep\n"; my ($Loor, $Roor) = ("", ""); if ($out_of_range) { my $N_out_of_range = $ND_orig - $ND_hist; print STDERR "Out of range discprepancy $N_out_of_range != $ND_left_cut + $ND_right_cut\n" if $N_out_of_range != $ND_left_cut + $ND_right_cut; $Loor = ' Loor=' . $ND_left_cut . sprintf("/%.1f", 100 * $ND_left_cut / $ND_orig) . '%'; $Roor = ' Roor=' . $ND_right_cut . sprintf("/%.1f", 100 * $ND_right_cut / $ND_orig) . '%'; } if (defined($notch_anchor) || @notches || @dquery) { @dquery = grep { $_ >= $hmin && $_ <= $hmax } @dquery; my $u = $emoji ? 2 : 1; my $c = $emoji ? '=' : ' '; my $ruler = $c x ($NBINS * $u); my $delim = '▭' x ($NBINS * $u); my $vals1 = ' ' x ($NBINS * $u); my $vals2 = ' ' x ($NBINS * $u); my $f = 1.0; # 0.9999 sometimes? needed? for awkward boundaries; ongoing. if (defined($notch_anchor) && $notch_skip) { my $start = $notch_anchor - $notch_skip * floor(($notch_anchor - $hmin)/$notch_skip); my $end = $notch_anchor + $notch_skip * ceil(($hmax - $notch_anchor)/$notch_skip); my $n = ceil(($end - $start) / $notch_skip); @notches = grep { $_ >= $hmin && $_ <= $hmax } map { $start + $_ * $notch_skip } 0..$n; } for (@notches) { my $x = int($f * ($_ - $hmin) / $width); if ($x >= 0 && $x < $NBINS) { substr($ruler, $u * $x, 1) = '|'; substr($vals1, $u * $x, length($_)) = $_; } if ($x == $NBINS) { $ruler .= '|'; $vals1 .= $_; } } for (@dquery) { my $x = int($f * ($_ - $hmin) / $width); if ($x >= 0 && $x < $NBINS) { substr($ruler, $u * $x, 1) = '◆'; substr($vals2, $u * $x, length($_)) = $_; } if ($x == $NBINS) { $ruler .= '+'; $vals2 .= $_; } } substr($ruler, $NBINS) = "" if $block && length($ruler) > $NBINS; substr($vals1, $NBINS) = "" if $block && length($vals1) > $NBINS; substr($vals2, $NBINS) = "" if $block && length($vals2) > $NBINS; substr($delim, $NBINS) = "" if $block && length($delim) > $NBINS; print "$ruler\n"; print "$vals1\n" if @notches; print "$vals2\n" if @dquery; print "$delim\n"; } if ($terse) { } else { my $bigbinpct = sprintf("%.1f", 100 * $bigbin / $ND_orig); my @info = (); push @info, "ND=$ND_orig$Loor$Roor hmin=$hmin hmax=$hmax dmin=$dmin dmax=$dmax"; $info[-1] .= " Description: $annot" if $annot && !defined($annotpos); push @info, "topbin(i=$bigbinid,n=$bigbin,p=$bigbinpct,x~$bigbinmid) bw=$width qr=($q1 $q2 $q3)"; if ($block) { for (@info) { if (length($_) > $NBINS) { substr($_, $NBINS-1) = '@' ; } elsif (length($_) < $NBINS) { $_ .= ' ' x ($NBINS - length($_)) ; } } } print map { "$_\n" } @info; } if (@dquery) { unshift @dquery, $dmin; push @dquery, $dmax; my $pivot = 0; my $sum = 0; print "Division bin sizes\n"; printf "%10s\t%10s\tCount\tPct\tSum\tPct\n", "Left", "Right"; for (my $i=1; $i<@dquery; $i++) { printf "%10s\t%10s\t", $dquery[$i-1], $dquery[$i]; my $count = 0; $pivot++ while $pivot < @D && $D[$pivot] < $dquery[$i-1]; $pivot++, $count++ while $pivot < @D && $D[$pivot] < $dquery[$i]; $count += $ND_left_cut if $i == 1; $count += $ND_right_cut if $i+1 == @dquery; $sum += $count; printf "%d\t%.2f\t%d\t%.2f\n", $count, 100*$count/$ND_orig, $sum, 100*$sum/$ND_orig; } }