#!/usr/bin/perl -w

=pod

HTX v0.7 - Html To Xhtml Convertor
Copyright (C) 2004-2005 Jamie Cheetham
Email: jamie at softham.co.uk

##############################################################

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

##############################################################

For further information see the README file.

=cut

use strict;
use warnings;
use Getopt::Long;

# Command line switches.  Every flag is initialised explicitly: assigning a
# single '' to a list of variables only initialises the first one.
my ($dos, $mac, $multi, $help, $version, $verbose, $tty) = ('') x 7;

# File currently being converted and the file the result is written to.
# Both are read by convert().
my ($filename, $output) = ('', '');

my $prog_version = '0.7.7';

# Handle command line arguments
GetOptions('dos'         => \$dos,
           'mac'         => \$mac,
           'multi'       => \$multi,
           'tty'         => \$tty,
           'help'        => \$help,
           'verbose|ver' => \$verbose,
           'version|v'   => \$version)
    or die "usage: htx [-h|-v] [-m|-d] [-ver] [-t] filename [output-filename]\n\n";

version() if ($version);
usage()   if ($help);

die "ERROR: Cannot select DOS and Mac text files simultaneously.\n\n" if ($dos && $mac);
die "ERROR: No filename specified.\n\n" if (@ARGV == 0);

# Populate data arrays

# Elements that have no closing tag in HTML and must be written as <tag />.
my @empty_tags = qw/area base basefont br col frame hr img input isindex link meta param/;

# Attributes that HTML allows in minimized form (e.g. "checked").
my @min_attr = qw/compact checked declare readonly disabled selected defer ismap nohref noshade nowrap multiple noresize/;

# Document type declarations, index-aligned: strict, transitional, frameset.
# NOTE(review): the original strings were lost from the damaged source (all
# nine entries had become ''); the values below are a reconstruction that
# assumes HTML 4.01 -> XHTML 1.0 - confirm against the upstream release.
my @old_doctypes1 = (
    '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">',
    '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">',
    '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Frameset//EN" "http://www.w3.org/TR/html4/frameset.dtd">',
);
my @old_doctypes2 = (
    '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN">',
    '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">',
    '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Frameset//EN">',
);
my @new_doctypes = (
    '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">',
    '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">',
    '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Frameset//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd">',
);

# Process each file in turn.  In single-file mode only the first argument is
# an input; the optional second argument names the output file and must never
# be picked up as an input when the first file is skipped.
my @inputs = $multi ? @ARGV : ($ARGV[0]);

foreach my $argument (@inputs) {
    $filename = $argument;

    if ($multi) {
        $output = $filename;
    }
    else {
        $output = (defined $ARGV[1] && $ARGV[1] ne '') ? $ARGV[1] : $filename;
    }

    if (!-e $filename) {
        warn "WARNING: File not found: $filename\n";
        next;
    }
    if (!-r $filename) {
        warn "WARNING: Unable to read file: $filename\n";
        next;
    }
    if (!-T $filename) {
        warn "WARNING: Cannot process non-text file: $filename\n";
        next;
    }

    convert();
}

print "Done.\n\n" if (!$tty);
exit;

# convert()
#
# Reads the file named by the file-level $filename, rewrites its markup
# towards XHTML (quoted attributes, lowercase element and attribute names,
# self-closed empty elements, expanded minimized attributes, updated
# DOCTYPE) and writes the result to the file-level $output, or to standard
# output when --tty was given.
#
# Takes no arguments and returns nothing useful.  Dies if the input cannot
# be opened or the output cannot be written.
sub convert {
    print "Opening $filename...\n" if ($verbose);

    # Load the file and slurp it into a string
    open(my $input_fh, '<', $filename)
        or die "ERROR: Unable to read file: $filename.\n\n";
    my $string = do { local $/; <$input_fh> };
    close $input_fh;
    $string = '' if (!defined $string);

    print "Processing $filename...\n" if ($verbose);

    # Remove CR characters to change from Windows to Linux line breaks
    $string =~ s/\r//g if ($dos);

    # Replace CR characters to change from Mac to Linux line breaks
    $string =~ s/\r/\n/g if ($mac);

    # Warn about old ICRA data
    # NOTE(review): the original test and message were lost from the damaged
    # source; this PICS-label check is a reconstruction - confirm.
    if ($verbose) {
        if ($string =~ m/<meta\s+http-equiv=["']?pics-label/i) {
            warn "WARNING: $filename contains ICRA (PICS) label data that may need updating\n";
        }
    }

    # Process every tag that carries at least one attribute
    my @attr_tags = ($string =~ m/(<[^>]+?=.+?>)/gm);
    foreach my $tag (@attr_tags) {
        my $new_tag = $tag;

        # Double quote unquoted alphanumeric attribute values
        $new_tag =~ s/(\w+?)='([^='"]+?)'([ |>])/$1="$2"$3/g;
        $new_tag =~ s/(\w+?)=([^\s"]+?)([ |>])/$1="$2"$3/g;

        # Make chars between < and =", containing no ", lowercase
        $new_tag =~ s/^(<[^"]+?)="/\L$1\E="/;

        # Make chars between " and =", containing no ", lowercase
        $new_tag =~ s/"(\s)([^"]+?)="/"$1\L$2\E="/g;

        # Make chars between " and >, containing no ", lowercase
        $new_tag =~ s/("[^"]+?) *?>$/\L$1\E>/;

        # Make the values of the align, valign and shape properties lowercase
        $new_tag =~ s/ (v?)align="(.+?)"/ $1align="\L$2\E"/g;
        $new_tag =~ s/ shape=\"(.+?)"/ shape="\L$1\E"/;

        $string =~ s/\Q$tag\E/$new_tag/g if ($new_tag ne $tag);
    }

    # Make chars between < and >, containing no " and not starting with <!,
    # lowercase (i.e. tags without quoted attributes; comments and the
    # DOCTYPE are left alone).
    # NOTE(review): pattern reconstructed from a damaged source line.
    $string =~ s/(<[^!"<>][^"<>]*>)/\L$1\E/g;

    # Add closing slash to empty tags.  The \b stops "col", "frame" and
    # "base" from also matching <colgroup>, <frameset> and <basefont>, and
    # an already present closing slash is absorbed rather than doubled.
    foreach my $empty_tag (@empty_tags) {
        $string =~ s{<($empty_tag)\b([^>]*?)\s*/?>}{<$1$2 />}g;
    }
    $string =~ s# / /># />#g;

    # Process each and every tag individually, ignoring ones starting with <!
    my @all_tags = ($string =~ m/(<[^!][^>]+?>)/gm);
    foreach my $tag (@all_tags) {
        my $new_tag = $tag;

        # Correct attribute minimization
        foreach my $attr (@min_attr) {
            $new_tag =~ s/ $attr([ |>])/ $attr="$attr"$1/g;
        }

        # Make hex colour codes lowercase
        $new_tag =~ s/(\#[A-Fa-f0-9]{3})/\L$1\E/g;
        $new_tag =~ s/(\#[A-Fa-f0-9]{6})/\L$1\E/g;

        # Check for unmatched double quotes
        if (($tag =~ tr/"//) & 1) {
            warn "WARNING: $tag is potentially invalid\n";
        }

        $string =~ s/\Q$tag\E/$new_tag/g if ($new_tag ne $tag);
    }

    # Change the name attribute to id in <a> and <map> tags.  The match is
    # confined to a single tag so it cannot run on into a later element's
    # name attribute, and \b keeps <area>, <abbr> etc. out of it.
    $string =~ s/<(a|map)\b(\s(?:[^>]*?\s)?)name="([^"]*)"/<$1$2id="$3"/g;

    # Change the value of the clear attribute to lowercase in <br> tags
    # NOTE(review): both halves of this substitution were lost from the
    # damaged source (it read s///g); reconstructed from the comment.
    $string =~ s/(<br\b[^>]*?\sclear=")([^"]+)"/$1\L$2\E"/g;

    # Update doctype or add if missing.  The old declarations are matched
    # literally (\Q...\E) and empty table entries are skipped, because an
    # empty pattern makes Perl reuse the last successful pattern.
    DOCTYPE:
    for my $index (0 .. $#new_doctypes) {
        my $replacement = $new_doctypes[$index];
        foreach my $old ($old_doctypes1[$index], $old_doctypes2[$index]) {
            next if (!defined $old || $old eq '');
            last DOCTYPE if ($string =~ s/\Q$old\E/$replacement/i);
        }
    }

    # NOTE(review): the original fallback was lost from the damaged source;
    # prepending the transitional declaration is an assumption - confirm.
    if (!($string =~ m/<!DOCTYPE/i)) {
        $string = $new_doctypes[1] . "\n" . $string;
    }

    # Deliver the result.
    # NOTE(review): the --tty branch is reconstructed; only the file-writing
    # half survived in the damaged source - confirm.
    if ($tty) {
        print $string;
    }
    else {
        open(my $output_fh, '>', $output)
            or die "ERROR: Unable to write to $output\n\n";
        binmode $output_fh;
        print {$output_fh} $string;
        # Buffered write errors only surface when the handle is closed.
        close $output_fh
            or die "ERROR: Unable to write to $output\n\n";
    }

    return;
}

# version()
#
# Prints the program version and copyright line, then exits.
sub version {
    print "HTX version $prog_version, Copyright (C) 2004-2005 Jamie Cheetham\n";
    print "Softham: http://www.softham.co.uk/\n\n";
    exit;
}

# usage()
#
# Prints the command line help, then exits.
sub usage {
    print <<"EOF";
HTML To XHTML Convertor $prog_version, Copyright (C) 2004-2005 Jamie Cheetham

Usage: htx [--dos|--mac] [--verbose] [--tty] <filename> [<output-filename>]
       htx [--dos|--mac] [--verbose] [--multi] <filename> [<filename> ...]
       htx [--help|--version]

If the output filename isn't specified in single-file mode, the initial
file is overwritten with the updated code.

Options:
  -d   --dos      Convert line breaks from Windows text files to use in Unix/Linux.
  -ma  --mac      Convert line breaks from Mac text files to use in Unix/Linux.
  -mu  --multi    Process multiple files simultaneously and overwrite them.
  -t   --tty      Write the converted document to standard output.
  -h   --help     Display this help and exit.
  -ver --verbose  Display extra information while processing.
  -v   --version  Display version number and exit.

EOF
    exit;
}

__END__