#!/usr/bin/perl -w

=pod

HTX v0.7 - Hhtml To Xhtml Convertor

Copyright (C) 2004-2008 Jamie Cheetham
	Email: jamie at softham.co.uk

##############################################################

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

##############################################################

For further information see the README file.

=cut

my @entities = ('quot','amp','apos','lt','gt','nbsp','iexcl','cent','pound','curren','yen','brvbar','sect','uml','copy','ordf','laquo','not','shy','reg','macr','deg','plusmn','sup2','sup3','acute','micro','para','middot','cedil','sup1','ordm','raquo','frac14','frac12','frac34','iquest','Agrave','Aacute','Acirc','Atilde','Auml','Aring','AElig','Ccedil','Egrave','Eacute','Ecirc','Euml','Igrave','Iacute','Icirc','Iuml','ETH','Ntilde','Ograve','Oacute','Ocirc','Otilde','Ouml','times','Oslash','Ugrave','Uacute','Ucirc','Uuml','Yacute','THORN','szlig','agrave','aacute','acirc','atilde','auml','aring','aelig','ccedil','egrave','eacute','ecirc','euml','igrave','iacute','icirc','iuml','eth','ntilde','ograve','oacute','ocirc','otilde','ouml','divide','oslash','ugrave','uacute','ucirc','uuml','yacute','thorn','yuml','OElig','oelig','Scaron','scaron','Yuml','fnof','circ','tilde','Alpha','Beta','Gamma','Delta','Epsilon','Zeta','Eta','Theta','Iota','Kappa','Lambda','Mu','Nu','Xi','Omicron','Pi','Rho','Sigma','Tau','Upsilon','Phi','Chi','Psi','Omega','alpha','beta','gamma','delta','epsilon','zeta','eta','theta','iota','kappa','lambda','mu','nu','xi','omicron','pi','rho','sigmaf','sigma','tau','upsilon','phi','chi','psi','omega','thetasym','upsih','piv','ensp','emsp','thinsp','zwnj','zwj','lrm','rlm','ndash','mdash','lsquo','rsquo','sbquo','ldquo','rdquo','bdquo','dagger','Dagger','bull','hellip','permil','prime','Prime','lsaquo','rsaquo','oline','frasl','euro','image','weierp','real','trade','alefsym','larr','uarr','rarr','darr','harr','crarr','lArr','uArr','rArr','dArr','hArr','forall','part','exist','empty','nabla','isin','notin','ni','prod','sum','minus','lowast','radic','prop','infin','ang','and','or','cap','cup','int','there4','sim','cong','asymp','ne','equiv','le','ge','sub','sup','nsub','sube','supe','oplus','otimes','perp','sdot','lceil','rceil','lfloor','rfloor','lang','rang','loz','spades','clubs','hearts','diams');

use strict;
use Getopt::Long;

my ($dos, $mac, $multi, $help, $version, $verbose, $tty, $filename, $output) = '';
my $prog_version = '0.7.8';

# Handle command line arguments
GetOptions('dos' => \$dos,
		   'mac' => \$mac,
		   'multi' => \$multi,
		   'tty' => \$tty,
		   'help' => \$help,
		   'verbose|ver' => \$verbose,
		   'version|v' => \$version) || die "usage: htx [-h|-v] [-m|-d] [-ver] [-t] filename [output-filename]\n\n";
version() if ($version);
usage() if ($help);
die "ERROR: Cannot select DOS and Mac text files simultaneously.\n\n" if ($dos && $mac);
die "ERROR: No filename specified.\n\n" if (@ARGV == 0);

# Populate data arrays
my @empty_tags = qw/area base basefont br col frame hr img input isindex link meta param/;
my @min_attr = qw/compact checked declare readonly disabled selected defer ismap nohref noshade nowrap multiple noresize/;
my @old_doctypes1 = ('<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01? Transitional//EN"\s*?>',
					 '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01? Strict//EN"\s*?>',
					 '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01? Frameset//EN"\s*?>');
my @old_doctypes2 = ('<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01? Transitional//EN"'.
					 '\s+"http://www.w3.org/TR/html4/loose.dtd">',
					 '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01? Strict//EN"'.
					 '\s+"http://www.w3.org/TR/html4/strict.dtd">',
					 '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01? Frameset//EN"'.
					 '\s+"http://www.w3.org/TR/html4/frameset.dtd">');
my @new_doctypes = ('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"'."\n".
					'  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">',
					'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"'."\n".
					'  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">',
					'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Frameset//EN"'."\n".
					'  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd">');

# Process each file in turn
foreach my $argument (@ARGV) {
	$filename = $argument;
	if ($multi) {$output = $filename;} else {$output = $ARGV[1] || $filename;}
	if ($multi and -d $filename) {
		opendir (DIR, $filename) or die "ERROR: Unbale to read directory: $filename.\n\n";
		chdir($filename);
		for(readdir(DIR)){
			next if(-d);
			$filename = $_;
			$output = $filename;
			convert();
		}
		closedir DIR;
		last;
	}
	if (!-e $filename) {
		warn "WARNING: File not found: $filename\n";
		next;
	}
	if (!-r $filename) {
		warn "WARNING: Unable to read file: $filename\n";
		next;
	}
	if (!-T $filename) {
		warn "WARNING: Cannot process non-text file: $filename\n";
		next;
	}
	convert();
	last if (!$multi);
}

print "Done.\n\n" if (!$tty);

exit;


sub convert {
	print "Opening $filename...\n" if ($verbose);

	# Load the file and slurp it into a string
	open (INPUT, "< $filename") or die "ERROR: Unable to read file: $filename.\n\n";
	my $string = do { local $/; <INPUT> };
	close INPUT;

	print "Processing $filename...\n" if ($verbose);

	# Remove CR characters to change from Windows to Linux line breaks
	$string =~ s/\r//g if ($dos);

	# Replace CR characters to change from Mac to Linux line breaks
	$string =~ s/\r/\n/g if ($mac);

	$string =~ s/ height="26"//g;
	$string =~ s/width="420"/width="250"/g;
	$string =~ s/width="150"/width="100"/g;
	$string =~ s/<td class="windowbg" valign="middle" align="center" width="100">/<td class="windowbg" style="border-left: 1px solid #6e94b7; text-align: center; vertical-align: middle; width: 100px">/;
	$string =~ s/<img src="http:\/\/www\.bigbluecup\.com\/yabb\/Smileys\/default\/([^\.]+)\.gif"/<img style="vertical-align: middle" src="img\/Dark Smileys\/$1.png"/g;
	$string =~ s/rolleyes.png/rolleyes.gif/g;
	$string =~ s/<table border="0" cellpadding="4" cellspacing="1" align="center" class="bordercolor">/<table cellpadding="4" cellspacing="1" class="bordercolor">/;
	$string =~ s/<tr><td valign="top"><b>My games:<\/b>/<tr class="games"><td valign="top"><b>My games:<\/b>/;
	for($string =~ /\&.{9}/g){
		my $amper = $_;
		my $new_amper = $amper;
		if($new_amper =~ /;/){
			if($new_amper !~ /\&#\d+;/){
				my $donothing = 0;
				for(@entities){
					$donothing = 1 if($new_amper =~ /\&$_;/);
				}
				$new_amper =~ s/\&/&amp;/g unless($donothing);
			}
		}
		else {
			$new_amper =~ s/\&/&amp;/g;
		}
		$string =~ s/\Q$amper\E/$new_amper/g;
	}

	# Warn about old ICRA data
	if ($verbose) {
		if ($string =~ m/<meta http-equiv="pics-label" /i) {
			warn "WARNING: Old pics-label tag detected. ".
				 "It is recommended that you regenerate it at http://www.icra.org/label/\n";
		}
	}

	# Process each tag containing a = individually, ignoring ones starting with <? or <! or <%
	my @tags = ($string =~ m/(<[^?!%][^>]+?=.+?>)/gm);
	foreach my $tag (@tags) {
		my $new_tag = $tag;
	   
		# Double quote unquoted alphanumeric attribute values
		$new_tag =~ s/(\w+?)='([^='"]+?)'([ |>])/$1="$2"$3/g;
		$new_tag =~ s/(\w+?)=([^\s"']+?)([ |>])/$1="$2"$3/g;

		# Make chars between < and =", containing no ", lowercase
		$new_tag =~ s/^(<[^"]+?)="/\L$1\E="/;

		# Make chars between "  and =", containing no ", lowercase
		$new_tag =~ s/"(\s)([^"]+?)="/"$1\L$2\E="/g;

		# Make chars between " and >, containing no ", lowercase
		$new_tag =~ s/("[^"]+?) *?>$/\L$1\E>/;

		# Make the values of the align, valign and shape properties lowercase
		$new_tag =~ s/ (v?)align="(.+?)"/ $1align="\L$2\E"/g;
		$new_tag =~ s/ shape=\"(.+?)"/ shape="\L$1\E"/;
		
		$string =~ s/\Q$tag\E/$new_tag/g;
	}

	# Make chars between < and >, containing no " and not starting with <! or <? or <%, lowercase
	$string =~ s/(<[^?!%][^"]*?>)/\L$1\E/g;

	# Add closing slash to empty tags
	foreach (@empty_tags) {
		$string =~ s/(<$_.*?)("?) ?>/$1$2 \/>/gs;
	}
	$string =~ s# / /># />#g;

	# Process each and every tag individually, ignoring ones starting with <? or <! or <%
	@tags = ($string =~ m/(<[^?!%][^>]+?>)/gm);
	foreach my $tag (@tags) {
		my $new_tag = $tag;

		# Correct attribute minimization
		foreach (@min_attr) {
			$new_tag =~ s/ $_([ |>])/ $_="$_"$1/g;
		}

		# Make hex colour codes lowercase
		$new_tag =~ s/(\#[A-Fa-f0-9]{3})/\L$1\E/g;
		$new_tag =~ s/(\#[A-Fa-f0-9]{6})/\L$1\E/g;
		
		# Convert attributes to styles
		my @styles = ();
		if($new_tag =~ s/ width="([^"]+)"//){
			my $value = $1;
			my $px = '';
			$px = 'px' if($1 !~ /\%$/);
			push(@styles, "width: $value$px;");
		}
		if($new_tag =~ s/ height="([^"]+)"//){
			my $value = $1;
			my $px = '';
			$px = 'px' if($1 !~ /\%$/);
			push(@styles, "height: $value$px;");
		}
		if($new_tag =~ s/ size="([^"]+)"//){
			my $value = $1;
			my $px = '';
			$px = 'px' if($1 !~ /\%$/);
			push(@styles, "height: $value$px;");
		}
		if($new_tag =~ s/ valign="([^"]+)"//){
			my $value = $1;
			push(@styles, "vertical-align: $value;");
		}
		if($new_tag =~ s/ align="([^"]+)"//){
			my $value = $1;
			push(@styles, "text-align: $value;");
		}
		if($new_tag =~ s/ border="([^"]+)"//){
			my $value = $1;
			$value = 'none' if($value eq 0);
			push(@styles, "border: $value;");
		}
		my $style = join(' ', @styles);
		if(@styles){
			if($new_tag =~ /style=/){
				$new_tag =~ s/style="/style="$style/;
			}
			else {
				$new_tag =~ s/(\/?>)/ style="$style" $1/;
			}
		}
		
		# Remove link targets
		$new_tag =~ s/ target="[^"]*"//;

		$new_tag =~ s/  +/ /g;


		# Check for unmatched double quotes
		if (($tag =~ tr/"//) & 1) {warn "WARNING: $tag is potentially invalid\n";}

		$string =~ s/\Q$tag\E/$new_tag/g;
	}

	# Change the name attribute to id in <a> and <map> tags
	$string=~ s/<(a|map)( |.*?)name="(.*?)"( |>)/<$1$2id="$3"$4/g;

	# Change the value of the clear attribute to lowercase in <br> tags
	$string=~ s/<br( |.*?)clear="(.*?)" /<br$1clear="\L$2\E" /g;

	# Update the HTML tag itself
	$string =~ s/<html.*?>/<html xmlns="http:\/\/www.w3.org\/1999\/xhtml" xml:lang="en">/g;

	# Update doctype or add if missing
	#for(my $count = 0; $count < 3 ; $count++) {
	#	last if ($string =~ s/$old_doctypes1[$count]/$new_doctypes[$count]/i);
	#	last if ($string =~ s/$old_doctypes2[$count]/$new_doctypes[$count]/i);
	#}
	#if (!($string =~ m/<!DOCTYPE /i)) {$string = $new_doctypes[0]."\n\n".$string;}

	print "Writing $output...\n" if ($verbose);

	# Output the new file
	if ($tty) {print $string;}
	else {
		open (OUTPUT, ">$output") or die "ERROR: Unable to write to $output\n\n";
		binmode OUTPUT;
		print OUTPUT $string;
		close OUTPUT;
	}
}

sub version {
	print "HTX version $prog_version, Copyright (C) 2004-2008 Jamie Cheetham\n";
	print "Softham: http://www.softham.co.uk/\n\n";
	exit;
}

sub usage {
	print << "EOF";
HTML To XHTML Convertor $prog_version, Copyright (C) 2004-2008 Jamie Cheetham

Usage: htx [--dos|--mac] [--verbose] <filename> [<output filename>]
	   htx [--dos|--mac] [--verbose] [--multi] <file pattern1> [<file pattern2> ...] 
	   htx [--help|--version]

If the output filename isn't specified in single-file mode, the initial file is
overwritten with the updated code.

Options:

  -d --dos
	Convert line breaks from Windows text files to use in Unix/Linux.

  -ma --mac
	Convert line breaks from Mac text files to use in Unix/Linux.

  -mu --multi
	Process multiple files simultaneously and overwrite them.

  -h --help
	Display this help and exit.

  -ver --verbose
	Display extra information while processing.

  -v --version
	Display version number and exit.

EOF
	exit;
}

__END__
