#!/usr/bin/perl

use XML::LibXML;
require "../../../scripts/gather/billdiff.pl";

# Parser shared by the driver below and by textify(). keep_blanks(1) asks
# libxml to retain whitespace-only text nodes when parsing.
my $xmlparser = XML::LibXML->new();
$xmlparser->keep_blanks(1);

# Base directory: PDFs are read from $dir/pdf and results are written to
# $dir/$outdir. Kept as a package variable because textify() reads it.
our $dir = '.';

# @docs:         document ids, in the order they are compared (each one is
#                diffed against its predecessor).
# %descriptions: human-readable description for each document id.
# $outdir:       subdirectory of $dir that receives the generated files.
our (@docs, %descriptions, $outdir);

# Manual switch between document sets: flip the 0/1 conditions to select one.
if (0) {
	# This is for the first bailout bill, Sept-Oct 2008.

	@docs = qw(
		AYO08B68
		AYO08B94
		AYO08C04
		AMEND001
		AYO08C32
	);

	%descriptions = (
		AYO08B68 => 'a draft posted Thursday, September 25',
		AYO08B94 => 'a draft posted Saturday, September 27 (but dated the 28th)',
		AYO08C04 => 'a draft posted Sunday, September 28',
		AMEND001 => 'the text of the plan as of September 28 at 9:10pm, which failed House vote on Monday, September 29 at 2pm (an amendment to set the text of H.R. 3997)',
		AYO08C32 => 'the text of the bill H.R. 1424 approved by the Senate the night of Wednesday, October 1',
		);

	$outdir = 'comparison';

} elsif (1) {
	# This is for the second bailout bill Jan-Feb 2009.
	# NOTE(review): no documents are configured for this set in this file,
	# so the check below stops the run until they are filled in.

}

# Without this check an empty configuration would run pdftotext on
# "$dir/pdf/.pdf" and write an empty index straight into $dir.
die "No documents configured: set \@docs, \%descriptions and \$outdir for the selected bill set\n"
	unless @docs && defined $outdir;

# Index document that collects one <entry> per generated comparison.
my $compindex = $xmlparser->parse_string('<index/>');

# Walk the documents pairwise: ($docs[0], $docs[1]), ($docs[1], $docs[2]), ...
# Each document is converted only once and reused as the left side of the
# next comparison.
my $left = textify($docs[0]);
for my $i (1 .. $#docs) {
	print "$i...\n";

	my $right = textify($docs[$i]);
	my $diff = RichDiff($left, $right);

	my $entry = $compindex->createElement('entry');
	$compindex->documentElement->appendChild($entry);
	$entry->setAttribute('id', $i);

	# Label both the diff's root element and the index entry with the ids
	# and descriptions of the two documents being compared.
	for my $node ($diff->documentElement, $entry) {
		$node->setAttribute('left-id', $docs[$i-1]);
		$node->setAttribute('left-description', $descriptions{$docs[$i-1]});

		$node->setAttribute('right-id', $docs[$i]);
		$node->setAttribute('right-description', $descriptions{$docs[$i]});
	}

	my $xml_path = "$dir/$outdir/$i.xml";
	my $html_path = "$dir/$outdir/$i.html";

	open my $out, '>', $xml_path or die "Cannot write $xml_path: $!\n";
	print {$out} $diff->toString();
	close $out or die "Cannot close $xml_path: $!\n";

	# Render the comparison to HTML. The list form of system() bypasses the
	# shell, so xsltproc writes the file itself via -o instead of a redirect.
	system('xsltproc', '-o', $html_path, 'bailoutbills.xsl', $xml_path) == 0
		or warn "xsltproc failed for $xml_path (status $?)\n";

	$left = $right;
}

my $index_path = "$dir/$outdir/index.xml";
open my $index_fh, '>', $index_path or die "Cannot write $index_path: $!\n";
print {$index_fh} $compindex->toString(2);
close $index_fh or die "Cannot close $index_path: $!\n";

# Convert "$dir/pdf/<name>.pdf" to text with pdftotext, clean up the lines,
# and return them as a parsed XML document of the form
#   <document><line indent="N">text</line>...</document>
#
# Argument: the PDF's base name (no directory, no extension).
# Returns:  the document object produced by $xmlparser->parse_string.
# Dies when the generated text file cannot be opened (or when parse_string
# rejects the assembled XML); a non-zero pdftotext exit only warns, since an
# existing .txt file may still be usable.
sub textify {
	my $fn = shift;
	my $pdf_path = "$dir/pdf/$fn.pdf";
	my $txt_path = "$dir/pdf/$fn.txt";

	# pdftotext writes its output next to the PDF as <name>.txt. The list
	# form of system() bypasses the shell.
	system('pdftotext', '-layout', '-enc', 'UTF-8', '-nopgbrk', $pdf_path) == 0
		or warn "pdftotext failed for $pdf_path (status $?)\n";

	# No encoding layer is applied on purpose: the non-ASCII literals
	# substituted at the end of this sub are compared as raw bytes, exactly
	# as before.
	open my $in, '<', $txt_path or die "Cannot read $txt_path: $!\n";

	my $xml = "<document>\n";
	while (my $line = <$in>) {
		# chomp (not chop) so a final line without a newline keeps its
		# last character.
		chomp $line;

		next if $line =~ /^\s*\[Discussion Draft\]\s*$/;
		next if $line =~ /^\s*\d+\s*$/; # probably page number by itself
		next if $line =~ /VerDate/;
		next if $line =~ /F:\\/i;

		$line =~ s|O:\\AYO\\AYO08.*\.xml||; # remove file name
		$line =~ s/September 28, 2008 \(9:10 p\.m\.\)//;
		$line =~ s/^\s*(\d|\d\d) //; # get rid of line numbers at starts of lines

		next if $line !~ /\S/; # skip blank lines since they appear in funny places

		# unwrap lines broken by hyphenation
		while ($line =~ s/-$//) {
			my $line2 = <$in>;
			$line2 = '' unless defined $line2; # hyphen on the last line: nothing to join
			chomp $line2;
			$line2 =~ s/^\s*(\d|\d\d)\s+//; # get rid of line numbers and indentation at start of next line
			$line .= $line2;
		}

		# The text is about to be embedded in XML, so escape markup
		# characters; otherwise a bare "&" or "<" in the bill text makes
		# parse_string fail.
		$line =~ s/&/&amp;/g;
		$line =~ s/</&lt;/g;
		$line =~ s/>/&gt;/g;

		# Turn a run of leading spaces into an indent attribute (make
		# indentation fairly hard), then halve each remaining pair of spaces.
		my $indent_attr = '';
		if ($line =~ s/^( +)//) {
			$indent_attr = ' indent="' . indent($1) . '"';
		}
		$line =~ s/  / /g;

		$xml .= "<line$indent_attr>$line</line>\n";
	}
	close $in;
	$xml .= "</document>\n";

	# NOTE(review): these two characters presumably stand in for the
	# "<<" and ">>" marks of the original PDF -- confirm against a sample.
	$xml =~ s/ø/\&lt;\&lt;/g;
	$xml =~ s/¿/\&gt;\&gt;/g;

	return $xmlparser->parse_string($xml);
}

# Compute the value used for a line's indent attribute from its run of
# leading spaces.
#
# Argument: the string of leading spaces.
# Returns:  its length, or the length divided by 5 when it is greater than
#           15 (so the result can be fractional, e.g. 16 spaces give 3.2).
sub indent {
	my ($spaces) = @_;
	my $width = length $spaces;
	return $width > 15 ? $width / 5 : $width;
}
