diff options
author | Vincent Ambo <Vincent Ambo> | 2020-01-11T23·36+0000 |
---|---|---|
committer | Vincent Ambo <Vincent Ambo> | 2020-01-11T23·36+0000 |
commit | 1b593e1ea4d2af0f6444d9a7788d5d99abd6fde5 (patch) | |
tree | e3accb9beed5c4c1b5a05c99db71ab2841f0ed04 /contrib/stats |
Squashed 'third_party/git/' content from commit cb71568594
git-subtree-dir: third_party/git git-subtree-split: cb715685942260375e1eb8153b0768a376e4ece7
Diffstat (limited to 'contrib/stats')
-rwxr-xr-x | contrib/stats/git-common-hash | 26 | ||||
-rwxr-xr-x | contrib/stats/mailmap.pl | 70 | ||||
-rwxr-xr-x | contrib/stats/packinfo.pl | 212 |
3 files changed, 308 insertions, 0 deletions
diff --git a/contrib/stats/git-common-hash b/contrib/stats/git-common-hash new file mode 100755 index 000000000000..e27fd088be1b --- /dev/null +++ b/contrib/stats/git-common-hash @@ -0,0 +1,26 @@ +#!/bin/sh + +# This script displays the distribution of longest common hash prefixes. +# This can be used to determine the minimum prefix length to use +# for object names to be unique. + +git rev-list --objects --all | sort | perl -lne ' + substr($_, 40) = ""; + # uncomment next line for a distribution of bits instead of hex chars + # $_ = unpack("B*",pack("H*",$_)); + if (defined $p) { + ($p ^ $_) =~ /^(\0*)/; + $common = length $1; + if (defined $pcommon) { + $count[$pcommon > $common ? $pcommon : $common]++; + } else { + $count[$common]++; # first item + } + } + $p = $_; + $pcommon = $common; + END { + $count[$common]++; # last item + print "$_: $count[$_]" for 0..$#count; + } +' diff --git a/contrib/stats/mailmap.pl b/contrib/stats/mailmap.pl new file mode 100755 index 000000000000..9513f5e35b44 --- /dev/null +++ b/contrib/stats/mailmap.pl @@ -0,0 +1,70 @@ +#!/usr/bin/perl + +use warnings 'all'; +use strict; +use Getopt::Long; + +my $match_emails; +my $match_names; +my $order_by = 'count'; +Getopt::Long::Configure(qw(bundling)); +GetOptions( + 'emails|e!' => \$match_emails, + 'names|n!' => \$match_names, + 'count|c' => sub { $order_by = 'count' }, + 'time|t' => sub { $order_by = 'stamp' }, +) or exit 1; +$match_emails = 1 unless $match_names; + +my $email = {}; +my $name = {}; + +open(my $fh, '-|', "git log --format='%at <%aE> %aN'"); +while(<$fh>) { + my ($t, $e, $n) = /(\S+) <(\S+)> (.*)/; + mark($email, $e, $n, $t); + mark($name, $n, $e, $t); +} +close($fh); + +if ($match_emails) { + foreach my $e (dups($email)) { + foreach my $n (vals($email->{$e})) { + show($n, $e, $email->{$e}->{$n}); + } + print "\n"; + } +} +if ($match_names) { + foreach my $n (dups($name)) { + foreach my $e (vals($name->{$n})) { + show($n, $e, $name->{$n}->{$e}); + } + print "\n"; + } +} +exit 0; + +sub mark { + my ($h, $k, $v, $t) = @_; + my $e = $h->{$k}->{$v} ||= { count => 0, stamp => 0 }; + $e->{count}++; + $e->{stamp} = $t unless $t < $e->{stamp}; +} + +sub dups { + my $h = shift; + return grep { keys($h->{$_}) > 1 } keys($h); +} + +sub vals { + my $h = shift; + return sort { + $h->{$b}->{$order_by} <=> $h->{$a}->{$order_by} + } keys($h); +} + +sub show { + my ($n, $e, $h) = @_; + print "$n <$e> ($h->{$order_by})\n"; +} diff --git a/contrib/stats/packinfo.pl b/contrib/stats/packinfo.pl new file mode 100755 index 000000000000..be188c0f11db --- /dev/null +++ b/contrib/stats/packinfo.pl @@ -0,0 +1,212 @@ +#!/usr/bin/perl +# +# This tool will print vaguely pretty information about a pack. It +# expects the output of "git verify-pack -v" as input on stdin. +# +# $ git verify-pack -v | packinfo.pl +# +# This prints some full-pack statistics; currently "all sizes", "all +# path sizes", "tree sizes", "tree path sizes", and "depths". +# +# * "all sizes" stats are across every object size in the file; +# full sizes for base objects, and delta size for deltas. +# * "all path sizes" stats are across all object's "path sizes". +# A path size is the sum of the size of the delta chain, including the +# base object. In other words, it's how many bytes need be read to +# reassemble the file from deltas. +# * "tree sizes" are object sizes grouped into delta trees. +# * "tree path sizes" are path sizes grouped into delta trees. +# * "depths" should be obvious. +# +# When run as: +# +# $ git verify-pack -v | packinfo.pl -tree +# +# the trees of objects are output along with the stats. This looks +# like: +# +# 0 commit 031321c6... 803 803 +# +# 0 blob 03156f21... 1767 1767 +# 1 blob f52a9d7f... 10 1777 +# 2 blob a8cc5739... 51 1828 +# 3 blob 660e90b1... 15 1843 +# 4 blob 0cb8e3bb... 33 1876 +# 2 blob e48607f0... 311 2088 +# size: count 6 total 2187 min 10 max 1767 mean 364.50 median 51 std_dev 635.85 +# path size: count 6 total 11179 min 1767 max 2088 mean 1863.17 median 1843 std_dev 107.26 +# +# The first number after the sha1 is the object size, the second +# number is the path size. The statistics are across all objects in +# the previous delta tree. Obviously they are omitted for trees of +# one object. +# +# When run as: +# +# $ git verify-pack -v | packinfo.pl -tree -filenames +# +# it adds filenames to the tree. Getting this information is slow: +# +# 0 blob 03156f21... 1767 1767 Documentation/git-lost-found.txt @ tags/v1.2.0~142 +# 1 blob f52a9d7f... 10 1777 Documentation/git-lost-found.txt @ tags/v1.5.0-rc1~74 +# 2 blob a8cc5739... 51 1828 Documentation/git-lost+found.txt @ tags/v0.99.9h^0 +# 3 blob 660e90b1... 15 1843 Documentation/git-lost+found.txt @ master~3222^2~2 +# 4 blob 0cb8e3bb... 33 1876 Documentation/git-lost+found.txt @ master~3222^2~3 +# 2 blob e48607f0... 311 2088 Documentation/git-lost-found.txt @ tags/v1.5.2-rc3~4 +# size: count 6 total 2187 min 10 max 1767 mean 364.50 median 51 std_dev 635.85 +# path size: count 6 total 11179 min 1767 max 2088 mean 1863.17 median 1843 std_dev 107.26 +# +# When run as: +# +# $ git verify-pack -v | packinfo.pl -dump +# +# it prints out "sha1 size pathsize depth" for each sha1 in lexical +# order. +# +# 000079a2eaef17b7eae70e1f0f635557ea67b644 30 472 7 +# 00013cafe6980411aa6fdd940784917b5ff50f0a 44 1542 4 +# 000182eacf99cde27d5916aa415921924b82972c 499 499 0 +# ... +# +# This is handy for comparing two packs. Adding "-filenames" will add +# filenames, as per "-tree -filenames" above. + +use strict; +use Getopt::Long; + +my $filenames = 0; +my $tree = 0; +my $dump = 0; +GetOptions("tree" => \$tree, + "filenames" => \$filenames, + "dump" => \$dump); + +my %parents; +my %children; +my %sizes; +my @roots; +my %paths; +my %types; +my @commits; +my %names; +my %depths; +my @depths; + +while (<STDIN>) { + my ($sha1, $type, $size, $space, $offset, $depth, $parent) = split(/\s+/, $_); + next unless ($sha1 =~ /^[0-9a-f]{40}$/); + $depths{$sha1} = $depth || 0; + push(@depths, $depth || 0); + push(@commits, $sha1) if ($type eq 'commit'); + push(@roots, $sha1) unless $parent; + $parents{$sha1} = $parent; + $types{$sha1} = $type; + push(@{$children{$parent}}, $sha1); + $sizes{$sha1} = $size; +} + +if ($filenames && ($tree || $dump)) { + open(NAMES, "git name-rev --all|"); + while (<NAMES>) { + if (/^(\S+)\s+(.*)$/) { + my ($sha1, $name) = ($1, $2); + $names{$sha1} = $name; + } + } + close NAMES; + + for my $commit (@commits) { + my $name = $names{$commit}; + open(TREE, "git ls-tree -t -r $commit|"); + print STDERR "Plumbing tree $name\n"; + while (<TREE>) { + if (/^(\S+)\s+(\S+)\s+(\S+)\s+(.*)$/) { + my ($mode, $type, $sha1, $path) = ($1, $2, $3, $4); + $paths{$sha1} = "$path @ $name"; + } + } + close TREE; + } +} + +sub stats { + my @data = sort {$a <=> $b} @_; + my $min = $data[0]; + my $max = $data[$#data]; + my $total = 0; + my $count = scalar @data; + for my $datum (@data) { + $total += $datum; + } + my $mean = $total / $count; + my $median = $data[int(@data / 2)]; + my $diff_sum = 0; + for my $datum (@data) { + $diff_sum += ($datum - $mean)**2; + } + my $std_dev = sqrt($diff_sum / $count); + return ($count, $total, $min, $max, $mean, $median, $std_dev); +} + +sub print_stats { + my $name = shift; + my ($count, $total, $min, $max, $mean, $median, $std_dev) = stats(@_); + printf("%s: count %s total %s min %s max %s mean %.2f median %s std_dev %.2f\n", + $name, $count, $total, $min, $max, $mean, $median, $std_dev); +} + +my @sizes; +my @path_sizes; +my @all_sizes; +my @all_path_sizes; +my %path_sizes; + +sub dig { + my ($sha1, $depth, $path_size) = @_; + $path_size += $sizes{$sha1}; + push(@sizes, $sizes{$sha1}); + push(@all_sizes, $sizes{$sha1}); + push(@path_sizes, $path_size); + push(@all_path_sizes, $path_size); + $path_sizes{$sha1} = $path_size; + if ($tree) { + printf("%3d%s %6s %s %8d %8d %s\n", + $depth, (" " x $depth), $types{$sha1}, + $sha1, $sizes{$sha1}, $path_size, $paths{$sha1}); + } + for my $child (@{$children{$sha1}}) { + dig($child, $depth + 1, $path_size); + } +} + +my @tree_sizes; +my @tree_path_sizes; + +for my $root (@roots) { + undef @sizes; + undef @path_sizes; + dig($root, 0, 0); + my ($aa, $sz_total) = stats(@sizes); + my ($bb, $psz_total) = stats(@path_sizes); + push(@tree_sizes, $sz_total); + push(@tree_path_sizes, $psz_total); + if ($tree) { + if (@sizes > 1) { + print_stats(" size", @sizes); + print_stats("path size", @path_sizes); + } + print "\n"; + } +} + +if ($dump) { + for my $sha1 (sort keys %sizes) { + print "$sha1 $sizes{$sha1} $path_sizes{$sha1} $depths{$sha1} $paths{$sha1}\n"; + } +} else { + print_stats(" all sizes", @all_sizes); + print_stats(" all path sizes", @all_path_sizes); + print_stats(" tree sizes", @tree_sizes); + print_stats("tree path sizes", @tree_path_sizes); + print_stats(" depths", @depths); +} |