#!/usr/bin/env perl
use 5.016;
use warnings;
use autodie;
use charnames ':full';
use Encode qw(decode);
use JSON::PP;
use Pod::Usage;

# Bail out early with usage information when called without repositories
# or when any argument asks for help.
if (!@ARGV) {
    pod2usage('Missing arguments.');
}
elsif (grep { /^--?h(elp)?$/ } @ARGV) {
    pod2usage(-verbose => 2);
}

# Maps each key of the emitted JSON object to the `git log --format`
# placeholder that provides its value.
my %COMMIT_ARGS = (
    hash           => '%H',
    author_name    => '%aN',
    author_mail    => '%aE',
    author_date    => '%at',
    committer_name => '%cN',
    committer_mail => '%cE',
    committer_date => '%ct',
    subject        => '%s',
    body           => '%b',
    notes          => '%N',
    signer         => '%GS',
    signer_key     => '%GK',
);

# Keys and placeholders are kept in the same (sorted) order so that the n-th
# field of git's output belongs to the n-th key.
my @COMMIT_KEYS = sort keys %COMMIT_ARGS;
my @FORMAT_ARGS = @COMMIT_ARGS{@COMMIT_KEYS};

my $COMMIT_SEPARATOR = "\N{RECORD SEPARATOR}";
my $DIFF_SEPARATOR   = "\N{UNIT SEPARATOR}";

# See the “Internals” in the documentation (run `perldoc parseman`)
my $FORMAT = $COMMIT_SEPARATOR . join('%x00', @FORMAT_ARGS) . $DIFF_SEPARATOR;

# parse_log($log)
#
# Splits the NUL-separated field portion of one commit record and returns a
# flat key/value list pairing each field with its name from @COMMIT_KEYS.
# Empty fields are kept as empty strings. An undefined or empty $log yields
# an empty list.
sub parse_log {
    my ($log) = @_;

    # The -1 limit keeps trailing empty fields, so positions stay aligned
    # with @COMMIT_KEYS even when the last fields are empty.
    my @fields = split "\0", $log // '', -1;

    return map { $COMMIT_KEYS[$_] => $fields[$_] } 0 .. $#fields;
}

# parse_diff($diff)
#
# Sums up the `--numstat` portion of one commit record and returns a flat
# key/value list with the keys touched_files, insertions and deletions.
# Lines that do not look like numstat entries are ignored. An undefined or
# empty $diff (a commit without any diff output) yields all-zero counters.
sub parse_diff {
    my ($diff) = @_;

    my %changes = (
        touched_files => 0,
        insertions    => 0,
        deletions     => 0,
    );

    for my $diff_line (split "\n", $diff // '') {
        # git prints "-" instead of a number for both counts of a binary
        # file. Such a file is still a touched file, it just contributes no
        # line counts.
        next unless $diff_line =~ /^(\d+|-)\s+(\d+|-)\s+(.+?)\s*$/;
        my ($inserted, $deleted) = ($1, $2);

        $changes{insertions} += $inserted unless $inserted eq '-';
        $changes{deletions}  += $deleted  unless $deleted  eq '-';
        $changes{touched_files}++;
    }

    return %changes;
}

# Setting the IDMAN_BEFORE environment variable passes its value on to git
# as `--before=...`.
my @git_args;
if ($ENV{IDMAN_BEFORE}) {
    push @git_args, "--before=$ENV{IDMAN_BEFORE}";
}

for my $repo_path (@ARGV) {
    # List-form pipe open: no shell is involved, so the repository path and
    # the control characters in the format need no quoting.
    open my $fh, '-|', 'git', "--git-dir=$repo_path/.git", 'log', '--all',
        "--format=$FORMAT", '-M', '-C', '--numstat', @git_args;

    # Strip the leading separator so that every record read below ends with
    # a separator instead of starting with one.
    my $leading_separator;
    read $fh, $leading_separator, length $COMMIT_SEPARATOR;

    local $/ = "\n$COMMIT_SEPARATOR";
    while (my $record = <$fh>) {
        chomp $record;

        # Limit of 2: a record is exactly "fields, separator, diff". The
        # diff part may be empty or missing; parse_diff copes with that.
        my ($log, $diff)
            = split /\Q$DIFF_SEPARATOR\E/, decode('UTF-8', $record), 2;

        say encode_json({
            repo => $repo_path,
            parse_log($log),
            parse_diff($diff),
        });
    }

    close $fh;
}

__END__

=head1 NAME

parseman - parse commit info from a git directory

=head1 SYNOPSIS

    parseman GIT_REPO_FOLDER...

This will parse all commits from the given git repo(s) and dump the results as
JSON objects on stdout, one per line.

=head1 DESCRIPTION

This script will take a git repository and run C<git log> on it, transforming
its output into structured JSON. This output can be piped into another program.

The output consists of several JSON-encoded objects, one object per line. The
consuming program can just read one line at a time, JSON-decode each one and
then handle its contents. For example output, just run C<./parseman .> to see
the output for this repository.

Each JSON object is just a mapping from string keys to string values (the three
change counters at the end of the list are numbers). Empty values will be empty
strings. The following values are in the object:

=over

=item repo

The path to the repository's local folder. If you want to run further git
commands on it, you might need to append C</.git> to it.

=item hash

The commit's sha-1 hash.

=item author_name

The name of the commit's author.

=item author_mail

The e-mail address of the author. Might not actually be a valid e-mail address,
git doesn't check this.

=item author_date

The date that the commit was authored as a Unix timestamp. Note that this is a
string of digits, not an integer.

=item committer_name

The name of the committer, which may or may not be different from the author.

=item committer_mail

The e-mail address of the committer. Same caveat as for the author applies.

=item committer_date

The date that the commit was committed, also as a Unix timestamp string.

=item subject

The commit message subject line.

=item body

The rest of the commit message.

=item notes

The notes attached to the commit. Basically a message in addition to the
regular commit message.

=item signer

Who signed the commit, if anybody did.

=item signer_key

The signature key of who signed the commit.

=item touched_files

=item insertions

=item deletions

The number of modified files, inserted lines and deleted lines in the commit,
respectively.

Renamed files are taken into account properly, so a rename on its own counts as
a single changed file with zero inserted or deleted lines. See the
C<--find-renames> and C<--find-copies> options in C<git help log> for details.

=back

=head1 INTERNALS

Calling C<git log> on every commit would be the easiest implementation for this
script, but that is just prohibitively slow. So instead, we run it only one
single time with a specially formatted output so that we can separate each
commit properly.

The format works as follows:

=over

=item Each commit is prefixed with a C<RECORD SEPARATOR> character. This
separates the commits from each other.

=item Each of the fields within a commit (hash, message, author name etc.) is
separated with a C<NUL> character.

=item After those fields follows a C<UNIT SEPARATOR>, after which git
automatically puts the diff information (changed files and lines).

=back

Parsing the format works by splitting the input by C<RECORD SEPARATOR>s and
splitting each of those records by C<UNIT SEPARATOR>s into a fields portion and
a diff portion. The fields are then simply split by C<NUL>s and the diff is
parsed via regular expression.

=cut