#!/usr/bin/perl -w
=pod
HTX v0.8 - HTML To XHTML Convertor
Copyright (C) 2004-2008 Jamie Cheetham
Email: jamie at softham.co.uk
##############################################################
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
##############################################################
For further information see the README file.
=cut
use strict;
use Getopt::Long;
use HTML::PullParser;
use File::Spec;
# Command-line state. These are file-scoped lexicals because the subs in
# this file (e.g. convert()) read $filename, $verbose, etc. directly
# instead of receiving them as arguments.
my ($win, $mac, $multi, $help, $tty, $dir, $version, $verbose, $filename, $output);
my $prog_version = '0.8.0-pre14';

# Handle command line arguments.
# GetOptions removes the recognised switches from @ARGV, leaving only the
# filename arguments behind; it returns false on an unknown/invalid switch.
# NOTE(review): the usage text advertises "-m", but with Getopt::Long's
# default abbreviation handling "-m" looks ambiguous between --mac and
# --multi -- confirm and adjust the usage text if so.
GetOptions('win'         => \$win,
           'mac'         => \$mac,
           'multi'       => \$multi,
           'help'        => \$help,
           'tty|t'       => \$tty,
           'dir|d=s'     => \$dir,
           'verbose|ver' => \$verbose,
           'version|v'   => \$version) || die "usage: htx [-h|-v] [-m|-w] [-ver] [-t] filename [output-filename]\n\n";

# version() and usage() are defined outside this part of the file;
# presumably each prints its message and exits -- TODO confirm.
version() if ($version);
usage() if ($help);

# Sanity-check the remaining arguments before any file is touched.
die "ERROR: Cannot select Windows and Mac text files simultaneously.\n\n" if ($win && $mac);
die "ERROR: No filename specified.\n\n" if (@ARGV == 0);

# --dir must name an existing *directory*. A plain -e test would also let
# a regular file through, so test with -d instead.
if ($dir) {die "ERROR: Directory not found: $dir\n\n" if (!-d $dir);}
# Populate data arrays
# Attribute names whose values convert() forces to lowercase; only the
# keys are consulted (via exists), the 0 values are placeholders.
my %lower_attr = qw/align 0 valign 0 clear 0 shape 0/;
# Tag names listed as "empty" elements.
# NOTE(review): not referenced in the visible part of the file; presumably
# used to emit self-closing tags -- confirm.
my @empty_tags = qw/area base basefont br col frame hr img input isindex link meta param/;
# Boolean attributes that may appear minimised (e.g. "checked"); convert()
# expands each one to the name="name" form.
my @min_attr = qw/compact checked declare readonly disabled selected defer ismap nohref noshade nowrap multiple noresize/;
# Doctype tables, three entries each.
# NOTE(review): every entry is an empty string in this copy of the file.
# Presumably the original DOCTYPE declarations were lost (angle-bracket
# content appears to have been stripped) -- restore from upstream before
# relying on them.
my @old_doctypes1 = ('',
'',
'');
my @old_doctypes2 = ('',
'',
'');
my @new_doctypes = ('',
'',
'');
# Process each file.
#
# With --multi every remaining argument is an input file and its output
# name is the same as its input name. Without --multi only the FIRST
# argument is an input; the optional second argument is the output name.
#
# The input list is therefore restricted to $ARGV[0] in single-file mode.
# Iterating over all of @ARGV and relying on "last" after convert() would
# let a failed check on the first argument "next" onto the output filename
# and treat that as the input.
#
# $filename and $output are the file-scoped variables that convert() reads,
# so they are assigned here rather than passed as arguments.
foreach my $candidate ($multi ? @ARGV : $ARGV[0]) {
    $filename = $candidate;
    $output   = $multi ? $filename : ($ARGV[1] || $filename);

    # Skip (with a warning) anything that is missing, unreadable or not text.
    if (!-e $filename) {
        warn "WARNING: File not found: $filename\n";
        next;
    }
    if (!-r $filename) {
        warn "WARNING: Unable to read file: $filename\n";
        next;
    }
    if (!-T $filename) {
        warn "WARNING: Cannot process non-text file: $filename\n";
        next;
    }

    convert();
}
print "Done.\n\n" if (!$tty);
exit;
sub convert {
# Load the file and slurp it into a string
open (INPUT, "< $filename") or die "ERROR: Unable to read file: $filename.\n\n";
my $string = do {local $/; };
close INPUT;
print "Processing $filename...\n" if ($verbose);
my $doc = HTML::PullParser->new( doc => \$string,
start => '"S", tagname, @attr',
end => '"E", tagname') or die "ERROR: Can't process: $!\n\n";
my ($start, $end, $empty) = (0,0,0);
# Warn about old ICRA data
if ($verbose) {
if ($string =~ m/]+?>)/gm);
foreach (@tags) {
my $new_tag = $_;
# Correct attribute minimisation
foreach (@min_attr) {$new_tag =~ s/ $_([ >])/ $_="$_"$1/gi;}
if ($new_tag =~ / /) {
my @attr = ($new_tag =~ m/ ([^="\s]+)[ >\/]/);
foreach (@attr) {warn "WARNING: $_ is an invalid minimised attribute.\n" if ($verbose);}
}
# Double quote alphanumeric attribute values
$new_tag =~ s/(\w+?)='([^\s']+?)'([ |>])/$1="$2"$3/g;
$new_tag =~ s/(\w+?)=([^\s"']+?)([ |>])/$1="$2"$3/g;
# Check for unmatched double quotes
if ((($_ =~ tr/"//) & 1) && $verbose) {warn "WARNING: $_ is potentially invalid.\n";}
$string =~ s/\Q$_\E/$new_tag/i;
}
# Process each tag with HTML::PullParser
$string =~ s/([{|}])/\\$1/g;
while (my $token = $doc->get_token) {
# Make tag name lowercase
my $new_token = '<';
if ($token->[0] eq "S") {$start++;}
if ($token->[0] eq "E") {$new_token .= '/'; $end++;}
$new_token .= "$token->[1]";
# Process each attribute individually and make appropriate values lowercase
for (my $x=2; $x[$x] && ($token->[$x] ne "/")) {
$token->[$x+1] =~ s/(\#[A-Fa-f0-9]{6})/\L$1\E/gi;
$token->[$x+1] =~ s/(\#[A-Fa-f0-9]{3})/\L$1\E/gi;
if (exists $lower_attr{$token->[$x]}) {$token->[$x+1] = "\L$token->[$x+1]\E";}
$new_token .= " $token->[$x]=\"$token->[$x+1]\"";
}
}
$new_token .= '>';
my $sub = $new_token;
$sub =~ s/ /(\\s+)/g;
$new_token =~ s/"/\\"/g;
$new_token = '"'.$new_token.'"';
my $i = 1;
while ($new_token =~ / /) {
$new_token =~ s/ /\$$i/;
$i++;
}
# Change name attribute to id in and