#!/usr/bin/perl -w =pod HTX v0.7 - Hhtml To Xhtml Convertor Copyright (C) 2004-2008 Jamie Cheetham Email: jamie at softham.co.uk ############################################################## This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ############################################################## For further information see the README file. =cut my @entities = ('quot','amp','apos','lt','gt','nbsp','iexcl','cent','pound','curren','yen','brvbar','sect','uml','copy','ordf','laquo','not','shy','reg','macr','deg','plusmn','sup2','sup3','acute','micro','para','middot','cedil','sup1','ordm','raquo','frac14','frac12','frac34','iquest','Agrave','Aacute','Acirc','Atilde','Auml','Aring','AElig','Ccedil','Egrave','Eacute','Ecirc','Euml','Igrave','Iacute','Icirc','Iuml','ETH','Ntilde','Ograve','Oacute','Ocirc','Otilde','Ouml','times','Oslash','Ugrave','Uacute','Ucirc','Uuml','Yacute','THORN','szlig','agrave','aacute','acirc','atilde','auml','aring','aelig','ccedil','egrave','eacute','ecirc','euml','igrave','iacute','icirc','iuml','eth','ntilde','ograve','oacute','ocirc','otilde','ouml','divide','oslash','ugrave','uacute','ucirc','uuml','yacute','thorn','yuml','OElig','oelig','Scaron','scaron','Yuml','fnof','circ','tilde','Alpha','Beta','Gamma','Delta','Epsilon','Zeta','Eta','Theta','Iota','Kappa','Lambda','Mu','Nu','Xi','Omicron','Pi','Rho','Sigma','Tau','Upsilon','Phi','Chi','Psi','Omega','alpha','beta','gamma','delta','epsilon','zeta','eta','theta','iota','kappa','lambda','mu','nu','xi','omicron','pi','rho','sigmaf','sigma','tau','upsilon','phi','chi','psi','omega','thetasym','upsih','piv','ensp','emsp','thinsp','zwnj','zwj','lrm','rlm','ndash','mdash','lsquo','rsquo','sbquo','ldquo','rdquo','bdquo','dagger','Dagger','bull','hellip','permil','prime','Prime','lsaquo','rsaquo','oline','frasl','euro','image','weierp','real','trade','alefsym','larr','uarr','rarr','darr','harr','crarr','lArr','uArr','rArr','dArr','hArr','forall','part','exist','empty','nabla','isin','notin','ni','prod','sum','minus','lowast','radic','prop','infin','ang','and','or','cap','cup','int','there4','sim','cong','asymp','ne','equiv','le','ge','sub','sup','nsub','sube','supe','oplus','otimes','perp','sdot','lceil','rceil','lfloor','rfloor','lang','rang','loz','spades','clubs','hearts','diams'); use strict; use Getopt::Long; my ($dos, $mac, $multi, $help, $version, $verbose, $tty, $filename, $output) = ''; my $prog_version = '0.7.8'; # Handle command line arguments GetOptions('dos' => \$dos, 'mac' => \$mac, 'multi' => \$multi, 'tty' => \$tty, 'help' => \$help, 'verbose|ver' => \$verbose, 'version|v' => \$version) || die "usage: htx [-h|-v] [-m|-d] [-ver] [-t] filename [output-filename]\n\n"; version() if ($version); usage() if ($help); die "ERROR: Cannot select DOS and Mac text files simultaneously.\n\n" if ($dos && $mac); die "ERROR: No filename specified.\n\n" if (@ARGV == 0); # Populate data arrays my @empty_tags = qw/area base basefont br col frame hr img input isindex link meta param/; my @min_attr = qw/compact checked declare readonly disabled selected defer ismap nohref noshade nowrap multiple noresize/; my @old_doctypes1 = ('', '', ''); my @old_doctypes2 = ('', '', ''); my @new_doctypes = ('', '', ''); # Process each file in turn foreach my $argument (@ARGV) { $filename = $argument; if ($multi) {$output = $filename;} else {$output = $ARGV[1] || $filename;} if ($multi and -d $filename) { opendir (DIR, $filename) or die "ERROR: Unbale to read directory: $filename.\n\n"; chdir($filename); for(readdir(DIR)){ next if(-d); $filename = $_; $output = $filename; convert(); } closedir DIR; last; } if (!-e $filename) { warn "WARNING: File not found: $filename\n"; next; } if (!-r $filename) { warn "WARNING: Unable to read file: $filename\n"; next; } if (!-T $filename) { warn "WARNING: Cannot process non-text file: $filename\n"; next; } convert(); last if (!$multi); } print "Done.\n\n" if (!$tty); exit; sub convert { print "Opening $filename...\n" if ($verbose); # Load the file and slurp it into a string open (INPUT, "< $filename") or die "ERROR: Unable to read file: $filename.\n\n"; my $string = do { local $/; }; close INPUT; print "Processing $filename...\n" if ($verbose); # Remove CR characters to change from Windows to Linux line breaks $string =~ s/\r//g if ($dos); # Replace CR characters to change from Mac to Linux line breaks $string =~ s/\r/\n/g if ($mac); $string =~ s/ height="26"//g; $string =~ s/width="420"/width="250"/g; $string =~ s/width="150"/width="100"/g; $string =~ s///; $string =~ s/

//; $string =~ s/

My games:<\/b>/

My games:<\/b>/; for($string =~ /\&.{9}/g){ my $amper = $_; my $new_amper = $amper; if($new_amper =~ /;/){ if($new_amper !~ /\&#\d+;/){ my $donothing = 0; for(@entities){ $donothing = 1 if($new_amper =~ /\&$_;/); } $new_amper =~ s/\&/&/g unless($donothing); } } else { $new_amper =~ s/\&/&/g; } $string =~ s/\Q$amper\E/$new_amper/g; } # Warn about old ICRA data if ($verbose) { if ($string =~ m/]+?=.+?>)/gm); foreach my $tag (@tags) { my $new_tag = $tag; # Double quote unquoted alphanumeric attribute values $new_tag =~ s/(\w+?)='([^='"]+?)'([ |>])/$1="$2"$3/g; $new_tag =~ s/(\w+?)=([^\s"']+?)([ |>])/$1="$2"$3/g; # Make chars between < and =", containing no ", lowercase $new_tag =~ s/^(<[^"]+?)="/\L$1\E="/; # Make chars between " and =", containing no ", lowercase $new_tag =~ s/"(\s)([^"]+?)="/"$1\L$2\E="/g; # Make chars between " and >, containing no ", lowercase $new_tag =~ s/("[^"]+?) *?>$/\L$1\E>/; # Make the values of the align, valign and shape properties lowercase $new_tag =~ s/ (v?)align="(.+?)"/ $1align="\L$2\E"/g; $new_tag =~ s/ shape=\"(.+?)"/ shape="\L$1\E"/; $string =~ s/\Q$tag\E/$new_tag/g; } # Make chars between < and >, containing no " and not starting with )/\L$1\E/g; # Add closing slash to empty tags foreach (@empty_tags) { $string =~ s/(<$_.*?)("?) ?>/$1$2 \/>/gs; } $string =~ s# / /># />#g; # Process each and every tag individually, ignoring ones starting with ]+?>)/gm); foreach my $tag (@tags) { my $new_tag = $tag; # Correct attribute minimization foreach (@min_attr) { $new_tag =~ s/ $_([ |>])/ $_="$_"$1/g; } # Make hex colour codes lowercase $new_tag =~ s/(\#[A-Fa-f0-9]{3})/\L$1\E/g; $new_tag =~ s/(\#[A-Fa-f0-9]{6})/\L$1\E/g; # Convert attributes to styles my @styles = (); if($new_tag =~ s/ width="([^"]+)"//){ my $value = $1; my $px = ''; $px = 'px' if($1 !~ /\%$/); push(@styles, "width: $value$px;"); } if($new_tag =~ s/ height="([^"]+)"//){ my $value = $1; my $px = ''; $px = 'px' if($1 !~ /\%$/); push(@styles, "height: $value$px;"); } if($new_tag =~ s/ size="([^"]+)"//){ my $value = $1; my $px = ''; $px = 'px' if($1 !~ /\%$/); push(@styles, "height: $value$px;"); } if($new_tag =~ s/ valign="([^"]+)"//){ my $value = $1; push(@styles, "vertical-align: $value;"); } if($new_tag =~ s/ align="([^"]+)"//){ my $value = $1; push(@styles, "text-align: $value;"); } if($new_tag =~ s/ border="([^"]+)"//){ my $value = $1; $value = 'none' if($value eq 0); push(@styles, "border: $value;"); } my $style = join(' ', @styles); if(@styles){ if($new_tag =~ /style=/){ $new_tag =~ s/style="/style="$style/; } else { $new_tag =~ s/(\/?>)/ style="$style" $1/; } } # Remove link targets $new_tag =~ s/ target="[^"]*"//; $new_tag =~ s/ +/ /g; # Check for unmatched double quotes if (($tag =~ tr/"//) & 1) {warn "WARNING: $tag is potentially invalid\n";} $string =~ s/\Q$tag\E/$new_tag/g; } # Change the name attribute to id in and tags $string=~ s/<(a|map)( |.*?)name="(.*?)"( |>)/<$1$2id="$3"$4/g; # Change the value of the clear attribute to lowercase in
tags $string=~ s///g; # Update doctype or add if missing #for(my $count = 0; $count < 3 ; $count++) { # last if ($string =~ s/$old_doctypes1[$count]/$new_doctypes[$count]/i); # last if ($string =~ s/$old_doctypes2[$count]/$new_doctypes[$count]/i); #} #if (!($string =~ m/$output") or die "ERROR: Unable to write to $output\n\n"; binmode OUTPUT; print OUTPUT $string; close OUTPUT; } } sub version { print "HTX version $prog_version, Copyright (C) 2004-2008 Jamie Cheetham\n"; print "Softham: http://www.softham.co.uk/\n\n"; exit; } sub usage { print << "EOF"; HTML To XHTML Convertor $prog_version, Copyright (C) 2004-2008 Jamie Cheetham Usage: htx [--dos|--mac] [--verbose] [] htx [--dos|--mac] [--verbose] [--multi] [ ...] htx [--help|--version] If the output filename isn't specified in single-file mode, the initial file is overwritten with the updated code. Options: -d --dos Convert line breaks from Windows text files to use in Unix/Linux. -ma --mac Convert line breaks from Mac text files to use in Unix/Linux. -mu --multi Process multiple files simultaneously and overwrite them. -h --help Display this help and exit. -ver --verbose Display extra information while processing. -v --version Display version number and exit. EOF exit; } __END__