docket_match

#!/bin/env perl
# Turn on autoflush for the currently selected output handle (STDOUT by
# default), so diagnostics and results appear as soon as they are printed.
$|=1;
use strict;
# JSON supplies decode_json (used by read_json) and to_json (final report).
use JSON;
# looks_like_number is used by datatype() to classify histogram keys.
use Scalar::Util qw(looks_like_number);
# Directory containing the searchLPHs.pl fingerprint search tool.
# NOTE(review): hard-coded, site-specific absolute path -- confirm it exists
# on the machine where this script is run.
my $lphbin = "/users/gglusman/proj/DOCKET/data-fingerprints";
# Minimum score reported by searchLPHs.pl for a column pair to be kept as a
# candidate match in step 1.
my $cutoff = 0.8;
# Command-line arguments: docket directories and/or arguments starting with
# '@', which are read as files listing dockets.
my(@files) = @ARGV;

# Collect dockets to be matched.
# Each command-line argument is either a docket directory or, when it starts
# with '@', a list file holding one docket path per line (only the first
# tab-delimited field of each line is used). Paths that fail
# looks_like_a_docket() are reported on STDOUT and skipped.
my @dockets;
foreach my $file (@files) {
	my @candidates;
	if ($file =~ /^@/) {
		# Prefer the literal name (the historical behaviour); when no such
		# file exists, treat the leading '@' as a marker and open the name
		# that follows it.
		my $list = $file;
		$list =~ s/^@// unless -e $list;
		my $fh;
		unless (open $fh, '<', $list) {
			# Report on STDERR and carry on with the remaining arguments
			# instead of silently reading nothing.
			warn "Cannot open docket list $list: $!\n";
			next;
		}
		while (my $line = <$fh>) {
			chomp $line;
			my($d) = split /\t/, $line;
			# Blank lines (or lines with an empty first field) name no docket.
			next unless defined $d && length $d;
			push @candidates, $d;
		}
		close $fh;
	} else {
		@candidates = ($file);
	}
	foreach my $d (@candidates) {
		if (looks_like_a_docket($d)) {
			push @dockets, $d;
		} else {
			print "#doesn't look like a docket: $d\n";
		}
	}
}
# Matching is pairwise, so at least two valid dockets are required.
die "Not enough dockets to match\n" unless scalar @dockets>1;

# Step 1: find candidate matches quickly via comparisons of column histogram fingerprints
# For every docket pair i<j, searchLPHs.pl is run on the two column-histogram
# fingerprint files and its tab-delimited output (query, target, score) is
# read. $cand[$i][$j] ends up holding a reference to a list of
# [query_column, target_column] pairs in which each column appears at most
# once; it stays undefined when no pair reaches $cutoff.
my @cand;
my $searcher = "$lphbin/searchLPHs.pl";
foreach my $i (0..$#dockets-1) {
	my $fp_i = "$dockets[$i]/fingerprints/cols_hist_fp.fp";
	foreach my $j ($i+1..$#dockets) {
		my $fp_j = "$dockets[$j]/fingerprints/cols_hist_fp.fp";
		my(@matching, %seen_query, %seen_target);
		# List-form pipe open runs the tool without a shell, so docket paths
		# containing spaces or shell metacharacters are passed through intact,
		# and a tool that cannot be started is reported instead of ignored.
		open(my $match, '-|', $searcher, $fp_i, $fp_j)
			or die "Cannot run $searcher: $!\n";
		while (my $line = <$match>) {
			chomp $line;
			my($q, $t, $c) = split /\t/, $line;
			# NOTE(review): stopping at the first sub-cutoff score assumes the
			# tool prints its best matches first -- confirm against searchLPHs.pl.
			last if $c<$cutoff;
			# Keep only the first (best) hit for each query and target column.
			next if $seen_query{$q} || $seen_target{$t};
			push @matching, [$q, $t];
			$seen_query{$q} = $seen_target{$t} = 1;
		}
		# The close status is deliberately not checked: leaving the loop early
		# closes the pipe while the tool may still be writing.
		close $match;
		$cand[$i][$j] = \@matching if @matching;
	}
}

# Step 2: trim candidate matches by computing Jaccards (categorical) or two-sample Kolmogorov-Smirnov tests (numerical)
# For each docket pair with candidates, the per-column histograms are loaded
# and every candidate column pair is scored in [0,1]:
#   both numeric     -> 1 - KS2s() distance
#   both categorical -> weighted_jaccard()
#   anything else    -> 0
# Pairs are then accepted greedily, best score first, each column being used
# at most once, until the score drops below $min_stat. One report entry per
# docket pair is appended to @comp and the whole list is printed as JSON.

my $min_stat = 0.75;	# lowest score accepted as a column match
my @comp;
foreach my $i (0..$#dockets-1) {
	next unless defined $cand[$i];
	my $datai = read_json("$dockets[$i]/data/cols_hist.json.gz");
	foreach my $j ($i+1..$#dockets) {
		next unless defined $cand[$i][$j];
		my $dataj = read_json("$dockets[$j]/data/cols_hist.json.gz");
		my(%stat, %type, %method);
		# column_matches starts as an empty list so that a docket pair with
		# no accepted match is reported with a count of 0 instead of
		# triggering an undefined-reference error when it is counted below.
		my %comp = (
			'query' => $dockets[$i],
			'target' => $dockets[$j],
			'column_matches' => [],
		);
		foreach my $match (@{$cand[$i][$j]}) {
			my($coli, $colj) = @$match;
			my $di = $datai->{$coli};
			my $dj = $dataj->{$colj};
			# A column without histogram data cannot be compared.
			next unless ref $di eq 'HASH' && ref $dj eq 'HASH';
			# Missing-value bins are dropped before comparing.
			delete @{$di}{'NA', ''};
			delete @{$dj}{'NA', ''};
			next unless %$di;
			next unless %$dj;
			my $typei = datatype($di);
			my $typej = datatype($dj);
			my $pair = join("\t", $coli, $colj);
			if ($typei ne $typej) {
				# Double quotes are required here so the two type names are
				# interpolated into the label.
				$type{$pair} = "${typei}_$typej";
				$stat{$pair} = 0;
			} elsif ($typei eq 'num') {
				$type{$pair} = 'num';
				$stat{$pair} = 1-KS2s($di, $dj);
				$method{$pair} = 'two-sample Kolmogorov-Smirnov';
			} elsif ($typei eq 'str') {
				$type{$pair} = 'str';
				$stat{$pair} = weighted_jaccard($di, $dj);
				$method{$pair} = 'weighted Jaccard';
			} else {
				$type{$pair} = 'mixed';
				$stat{$pair} = 0;
			}
		}
		my(%used_query, %used_target);
		# Ties are broken by pair name so the greedy selection (and therefore
		# the report) does not depend on hash ordering.
		foreach my $pair (sort {$stat{$b}<=>$stat{$a} || $a cmp $b} keys %stat) {
			last if $stat{$pair}<$min_stat;
			my($m0, $m1) = split /\t/, $pair;
			next if $used_query{$m0} || $used_target{$m1};
			$used_query{$m0} = $used_target{$m1} = 1;
			push @{$comp{'column_matches'}}, {
				'query' => $m0,
				'target' => $m1,
				'stat' => $method{$pair},
				'value' => sprintf("%.2f", $stat{$pair}),
				'type' => $type{$pair},
			};
		}
		$comp{'matching_columns'} = scalar @{$comp{'column_matches'}};
		push @comp, \%comp;
	}
}

print to_json(\@comp, {pretty=>1}), "\n";

sub datatype { #classifies a histogram by the kind of keys it holds
	#expects a reference to a hash whose keys are the observed values
	#returns 'num' when no counted key is non-numeric (this includes the case
	#of no counted keys at all), 'str' when no counted key is numeric, and
	#'mixed' otherwise
	my($what) = @_;
	my %tally = (num => 0, str => 0);
	foreach my $key (keys %$what) {
		# Keys that are false in Perl ('' and also '0') and the 'NA'
		# placeholder are not counted.
		# NOTE(review): skipping '0' means a zero value never counts as
		# numeric -- confirm this is intended.
		next if !$key || $key eq 'NA';
		$tally{ looks_like_number($key) ? 'num' : 'str' }++;
	}
	return 'num' if !$tally{str};
	return 'str' if !$tally{num};
	return 'mixed';
}


sub KS2s { #two-sample Kolmogorov-Smirnov test, approximate
	#expects input in form of two references to hashes representing histograms to be compared
	#(keys are numeric values, values are their counts)
	#returns the largest absolute difference between the two cumulative
	#distributions, evaluated at every value present in either histogram:
	#0 for identical distributions, 1 for non-overlapping ones.
	#returns 1 when either histogram has no observations, since there is
	#nothing to compare.
	### could be made more precise by interpolating missing values
	my($di, $dj) = @_;
	
	my(%cumul, $ti, $tj);
	foreach my $key (sort {$a<=>$b} keys %$di) {
		$ti += $di->{$key};
		$cumul{$key}[0] = $ti;
	}
	foreach my $key (sort {$a<=>$b} keys %$dj) {
		$tj += $dj->{$key};
		$cumul{$key}[1] = $tj;
	}
	# An empty or all-zero histogram would otherwise cause a division by zero.
	return 1 unless $ti && $tj;
	
	my($ci, $cj, $max) = (0, 0, 0);
	foreach my $key (sort {$a<=>$b} keys %cumul) {
		# A value missing from one histogram leaves that side's cumulative
		# fraction at its previous level.
		$ci = $cumul{$key}[0]/$ti if defined $cumul{$key}[0];
		$cj = $cumul{$key}[1]/$tj if defined $cumul{$key}[1];
		my $d = abs($ci-$cj);
		$max = $d if $d>$max;
	}
	return $max;
}

sub KS2sMod { #modified two-sample Kolmogorov-Smirnov test, approximate
	#expects input in form of two references to hashes representing histograms to be compared
	#(keys are numeric values, values are their counts)
	#returns the absolute difference between the two cumulative distributions
	#averaged over the combined value range: the difference seen at each
	#value is weighted by that value's distance from the previous one.
	#returns 1 when the value ranges do not overlap or when either histogram
	#has no observations; returns 0 when the combined range has zero width.
	### could be made more precise by interpolating missing values
	my($di, $dj, $pair) = @_; #$pair is accepted for call compatibility only
	
	my @sorti = sort {$a<=>$b} keys %$di;
	my @sortj = sort {$a<=>$b} keys %$dj;
	return 1 unless @sorti && @sortj;
	return 1 if $sorti[0]>$sortj[-1] || $sortj[0]>$sorti[-1];
	my(%cumul, $ti, $tj);
	foreach my $key (@sorti) {
		$ti += $di->{$key};
		$cumul{$key}[0] = $ti;
	}
	foreach my $key (@sortj) {
		$tj += $dj->{$key};
		$cumul{$key}[1] = $tj;
	}
	# All-zero counts would otherwise cause a division by zero.
	return 1 unless $ti && $tj;
	
	my @keys = sort {$a<=>$b} keys %cumul;
	my($ci, $cj) = (0, 0);
	my($sum, $n) = (0, 0);
	# Interval widths are measured from the smallest observed value, so the
	# result does not depend on how far the data lie from zero.
	my $prev = $keys[0];
	foreach my $key (@keys) {
		# A value missing from one histogram leaves that side's cumulative
		# fraction at its previous level.
		$ci = $cumul{$key}[0]/$ti if defined $cumul{$key}[0];
		$cj = $cumul{$key}[1]/$tj if defined $cumul{$key}[1];
		my $width = $key-$prev;
		$sum += abs($ci-$cj)*$width;
		$n += $width;
		$prev = $key;
	}
	return $n ? $sum/$n : 0;
}


sub weighted_jaccard { #Jaccard comparison for categorical data
	#expects input in form of two references to hashes representing histograms to be compared
	#(keys are categories, values are their counts)
	#returns sum over categories of the smaller count divided by the sum over
	#categories of the larger count; a category absent from one histogram
	#contributes only to the denominator.
	#returns 0 when the denominator is 0 (nothing to compare).
	my($di, $dj) = @_;
	
	my($int_size, $union_size) = (0, 0);
	while (my($key, $ivalue) = each %$di) {
		my $jvalue = $dj->{$key};
		if (!defined $jvalue) {
			$union_size += $ivalue;
		} elsif ($jvalue > $ivalue) {
			$int_size += $ivalue;
			$union_size += $jvalue;
		} else {
			$int_size += $jvalue;
			$union_size += $ivalue;
		}
	}
	# Categories seen only in the second histogram.
	while (my($key, $jvalue) = each %$dj) {
		$union_size += $jvalue unless exists $di->{$key};
	}
	
	return 0 unless $union_size;
	return $int_size / $union_size;
}


sub jaccard { #unweighted Jaccard comparison of the keys of two histograms
	#expects input in form of two references to hashes representing histograms to be compared
	#returns (number of keys of the first histogram that have a defined value
	#in the second) / (number of distinct keys over both histograms).
	#returns 0 when both histograms are empty.
	my($di, $dj) = @_;
	
	my $int_size = grep { defined $dj->{$_} } keys %$di;
	my %union;
	@union{keys %$di, keys %$dj} = ();
	my $union_size = scalar keys %union;
	return 0 unless $union_size;
	return $int_size / $union_size;
}


#########################
sub read_json {
	#reads a gzip-compressed JSON file and returns the decoded data structure
	#dies with a descriptive message if gunzip cannot be started, exits with
	#an error, or produces no output
	my($file) = @_;
	# List-form pipe open runs gunzip without a shell, so a path containing
	# spaces or shell metacharacters is passed through verbatim.
	open(my $fh, '-|', 'gunzip', '-c', $file)
		or die "Cannot run gunzip on $file: $!\n";
	my $json = do { local $/; <$fh> };
	# Closing a pipe reports the exit status of the child process.
	close $fh
		or die "gunzip failed on $file (status $?)\n";
	die "No JSON content in $file\n" unless defined $json && length $json;
	return decode_json($json);
}


sub looks_like_a_docket {
	#returns 1 when $dir is a directory that contains the subdirectories
	#analyses, fingerprints, comparisons and visualizations;
	#otherwise returns nothing (false in scalar context, empty list in list context)
	my($dir) = @_;
	
	return unless defined $dir && length $dir;
	foreach my $path ($dir, map {"$dir/$_"} qw(analyses fingerprints comparisons visualizations)) {
		return unless -d $path;
	}
	return 1;
}