#!/usr/bin/perl -w
=pod
HTX v0.7 - Hhtml To Xhtml Convertor
Copyright (C) 2004-2005 Jamie Cheetham
Email: jamie at softham.co.uk
##############################################################
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
##############################################################
For further information see the README file.
=cut
use strict;
use Getopt::Long;
my ($dos, $mac, $multi, $help, $version, $verbose, $tty, $filename, $output) = '';
my $prog_version = '0.7.7';
# Handle command line arguments
GetOptions('dos' => \$dos,
'mac' => \$mac,
'multi' => \$multi,
'tty' => \$tty,
'help' => \$help,
'verbose|ver' => \$verbose,
'version|v' => \$version) || die "usage: htx [-h|-v] [-m|-d] [-ver] [-t] filename [output-filename]\n\n";
version() if ($version);
usage() if ($help);
die "ERROR: Cannot select DOS and Mac text files simultaneously.\n\n" if ($dos && $mac);
die "ERROR: No filename specified.\n\n" if (@ARGV == 0);
# Populate data arrays
my @empty_tags = qw/area base basefont br col frame hr img input isindex link meta param/;
my @min_attr = qw/compact checked declare readonly disabled selected defer ismap nohref noshade nowrap multiple noresize/;
my @old_doctypes1 = ('',
'',
'');
my @old_doctypes2 = ('',
'',
'');
my @new_doctypes = ('',
'',
'');
# Process each file in turn
foreach my $argument (@ARGV) {
$filename = $argument;
if ($multi) {$output = $filename;} else {$output = $ARGV[1] || $filename;}
if (!-e $filename) {
warn "WARNING: File not found: $filename\n";
next;
}
if (!-r $filename) {
warn "WARNING: Unable to read file: $filename\n";
next;
}
if (!-T $filename) {
warn "WARNING: Cannot process non-text file: $filename\n";
next;
}
convert();
last if (!$multi);
}
print "Done.\n\n" if (!tty);
exit;
sub convert {
print "Opening $filename...\n" if ($verbose);
# Load the file and slurp it into a string
open (INPUT, "< $filename") or die "ERROR: Unable to read file: $filename.\n\n";
my $string = do { local $/; };
close INPUT;
print "Processing $filename...\n" if ($verbose);
# Remove CR characters to change from Windows to Linux line breaks
$string =~ s/\r//g if ($dos);
# Replace CR characters to change from Mac to Linux line breaks
$string =~ s/\r/\n/g if ($mac);
# Warn about old ICRA data
if ($verbose) {
if ($string =~ m/]+?=.+?>)/gm);
foreach my $tag (@tags) {
my $new_tag = $tag;
# Double quote unquoted alphanumeric attribute values
$new_tag =~ s/(\w+?)='([^='"]+?)'([ |>])/$1="$2"$3/g;
$new_tag =~ s/(\w+?)=([^\s"]+?)([ |>])/$1="$2"$3/g;
# Make chars between < and =", containing no ", lowercase
$new_tag =~ s/^(<[^"]+?)="/\L$1\E="/;
# Make chars between " and =", containing no ", lowercase
$new_tag =~ s/"(\s)([^"]+?)="/"$1\L$2\E="/g;
# Make chars between " and >, containing no ", lowercase
$new_tag =~ s/("[^"]+?) *?>$/\L$1\E>/;
# Make the values of the align, valign and shape properties lowercase
$new_tag =~ s/ (v?)align="(.+?)"/ $1align="\L$2\E"/g;
$new_tag =~ s/ shape=\"(.+?)"/ shape="\L$1\E"/;
$string =~ s/\Q$tag\E/$new_tag/g;
}
# Make chars between < and >, containing no " and not starting with )/\L$1\E/g;
# Add closing slash to empty tags
foreach (@empty_tags) {
$string =~ s/(<$_.*?)("?) ?>/$1$2 \/>/gs;
}
$string =~ s# / /># />#g;
# Process each and every tag individually, ignoring ones starting with or ]+?>)/gm);
foreach my $tag (@tags) {
my $new_tag = $tag;
# Correct attribute minimization
foreach (@min_attr) {
$new_tag =~ s/ $_([ |>])/ $_="$_"$1/g;
}
# Make hex colour codes lowercase
$new_tag =~ s/(\#[A-Fa-f0-9]{3})/\L$1\E/g;
$new_tag =~ s/(\#[A-Fa-f0-9]{6})/\L$1\E/g;
# Check for unmatched double quotes
if (($tag =~ tr/"//) & 1) {warn "WARNING: $tag is potentially invalid\n";}
$string =~ s/\Q$tag\E/$new_tag/g;
}
# Change the name attribute to id in and