#!/usr/bin/perl -w

=pod

HTX v0.8 - Html To Xhtml Convertor
Copyright (C) 2004-2008 Jamie Cheetham
Email: jamie at softham.co.uk

##############################################################

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

##############################################################

For further information see the README file.

=cut

use strict;
use warnings;
use Getopt::Long;
use HTML::PullParser;
use File::Spec;

# $filename and $output are shared with convert(), which reads them instead
# of taking arguments.
my ($win, $mac, $multi, $help, $tty, $dir, $version, $verbose, $filename, $output);
my $prog_version = '0.8.0-pre14';

# Handle command line arguments
GetOptions(
    'win'         => \$win,
    'mac'         => \$mac,
    'multi'       => \$multi,
    'help'        => \$help,
    'tty|t'       => \$tty,
    'dir|d=s'     => \$dir,
    'verbose|ver' => \$verbose,
    'version|v'   => \$version,
) or die "usage: htx [-h|-v] [-m|-w] [-ver] [-t] filename [output-filename]\n\n";

version() if ($version);
usage()   if ($help);
die "ERROR: Cannot select Windows and Mac text files simultaneously.\n\n" if ($win && $mac);
die "ERROR: No filename specified.\n\n" if (@ARGV == 0);

# -d rather than -e: a plain file with that name is not a usable output directory.
if ($dir) {
    die "ERROR: Directory not found: $dir\n\n" if (!-d $dir);
}

# Populate data arrays

# Attributes whose values are forced to lower case (only the keys are used).
my %lower_attr = qw/align 0 valign 0 clear 0 shape 0/;

# Elements that have no closing tag and therefore get a trailing " />".
my @empty_tags = qw/area base basefont br col frame hr img input isindex link meta param/;

# Boolean attributes that HTML allows in minimised form (e.g. "checked").
my @min_attr = qw/compact checked declare readonly disabled selected defer ismap nohref noshade nowrap multiple noresize/;

# NOTE(review): the three doctype tables below were empty strings in the copy
# of the source this was restored from (their "<...>" contents had been
# stripped). They are reconstructed from the public W3C declarations, in the
# order Strict, Transitional, Frameset. Which family each "old" table
# originally held (assumed here: HTML 4.01, then HTML 4.0) must be confirmed
# against the upstream release.
my @old_doctypes1 = (
    qr{<!DOCTYPE\s+HTML\s+PUBLIC\s+"-//W3C//DTD\s+HTML\s+4\.01//EN"[^>]*>}i,
    qr{<!DOCTYPE\s+HTML\s+PUBLIC\s+"-//W3C//DTD\s+HTML\s+4\.01\s+Transitional//EN"[^>]*>}i,
    qr{<!DOCTYPE\s+HTML\s+PUBLIC\s+"-//W3C//DTD\s+HTML\s+4\.01\s+Frameset//EN"[^>]*>}i,
);
my @old_doctypes2 = (
    qr{<!DOCTYPE\s+HTML\s+PUBLIC\s+"-//W3C//DTD\s+HTML\s+4\.0//EN"[^>]*>}i,
    qr{<!DOCTYPE\s+HTML\s+PUBLIC\s+"-//W3C//DTD\s+HTML\s+4\.0\s+Transitional//EN"[^>]*>}i,
    qr{<!DOCTYPE\s+HTML\s+PUBLIC\s+"-//W3C//DTD\s+HTML\s+4\.0\s+Frameset//EN"[^>]*>}i,
);
my @new_doctypes = (
    '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">',
    '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">',
    '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Frameset//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd">',
);

# Process each file.
# In single-file mode only the first argument is an input; the optional second
# argument is the output name. Iterating over all of @ARGV there would, after a
# failed check on the input, fall through to the output file and convert that.
my @inputs = $multi ? @ARGV : ($ARGV[0]);

foreach my $candidate (@inputs) {
    $filename = $candidate;
    $output   = $filename;
    if (!$multi && defined $ARGV[1] && length $ARGV[1]) {
        $output = $ARGV[1];
    }

    if (!-e $filename) {
        warn "WARNING: File not found: $filename\n";
        next;
    }
    if (!-r $filename) {
        warn "WARNING: Unable to read file: $filename\n";
        next;
    }
    if (!-T $filename) {
        warn "WARNING: Cannot process non-text file: $filename\n";
        next;
    }

    convert();
}

print "Done.\n\n" if (!$tty);
exit;

# convert()
#
# Converts the file named by the file-scoped $filename and writes the result
# to $output (prefixed with $dir when given), or to STDOUT when $tty is set.
# Takes no arguments and returns nothing useful; dies if the input cannot be
# opened or the output cannot be written.
#
# Steps, in order:
#   1. expand minimised attributes and double-quote attribute values;
#   2. rewrite every start/end tag reported by HTML::PullParser in canonical
#      form (lower-case names, selected values lower-cased);
#   3. optional line-break conversion (--win / --mac);
#   4. self-close empty elements, add the XHTML namespace, fix the doctype.
sub convert {
    # Load the file and slurp it into a string
    open(my $in, '<', $filename)
        or die "ERROR: Unable to read file: $filename.\n\n";
    my $string = do { local $/; <$in> };
    close $in;

    print "Processing $filename...\n" if ($verbose);

    # NOTE(review): these counters are maintained as in the original, but no
    # statement that reports them survived in the restored source.
    my ($start, $end, $empty) = (0, 0, 0);

    # Warn about old ICRA data
    # NOTE(review): the original pattern and message were lost; this is a
    # reconstruction that looks for a PICS label <meta> element - confirm.
    if ($verbose && $string =~ m/<meta\s[^>]*pics-label/i) {
        warn "WARNING: $filename contains old ICRA (PICS) label data.\n";
    }

    # Pass 1: textual clean-up of every "<...>" span.
    my @tags = ($string =~ m/(<[^>]+?>)/gm);
    foreach my $tag (@tags) {
        my $new_tag = $tag;

        # Correct attribute minimisation
        foreach my $attr (@min_attr) {
            $new_tag =~ s/ $attr([ >])/ $attr="$attr"$1/gi;
        }
        if ($verbose && $new_tag =~ / /) {
            my @suspects = ($new_tag =~ m/ ([^="\s]+)[ >\/]/);
            foreach my $suspect (@suspects) {
                warn "WARNING: $suspect is an invalid minimised attribute.\n";
            }
        }

        # Double quote alphanumeric attribute values
        $new_tag =~ s/(\w+?)='([^\s']+?)'([ |>])/$1="$2"$3/g;
        $new_tag =~ s/(\w+?)=([^\s"']+?)([ |>])/$1="$2"$3/g;

        # Check for unmatched double quotes (an odd count is suspicious)
        if ($verbose && (($tag =~ tr/"//) & 1)) {
            warn "WARNING: $tag is potentially invalid.\n";
        }

        $string =~ s/\Q$tag\E/$new_tag/i;
    }

    # Pass 2: process each tag with HTML::PullParser.
    # The parser reads its document lazily through the reference it is given,
    # so it must not share the buffer that is being edited; hand it a copy.
    # "text" is the literal source of each tag and is always the last field.
    my $snapshot = $string;
    my $doc = HTML::PullParser->new(
        doc   => \$snapshot,
        start => '"S", tagname, @attr, text',
        end   => '"E", tagname, text',
    ) or die "ERROR: Can't process: $!\n\n";

    # Tokens arrive in document order, so each tag is searched for from just
    # after the previous one; this keeps repeated tags from all resolving to
    # the first occurrence. A tag-like string inside a comment or script that
    # precedes an identical real tag can still be picked up instead.
    my $cursor = 0;
    while (my $token = $doc->get_token) {
        my @fields  = @{$token};
        my $source  = pop @fields;
        my $kind    = shift @fields;
        my $tagname = shift @fields;

        $start++ if ($kind eq 'S');
        $end++   if ($kind eq 'E');

        my $at = index($string, $source, $cursor);
        next if ($at < 0);

        my $rewritten = _rewrite_tag($kind, $tagname, \@fields, $source);
        if (defined $rewritten) {
            substr($string, $at, length($source), $rewritten);
            $cursor = $at + length($rewritten);
        }
        else {
            $cursor = $at + length($source);
        }
    }

    # Remove CR characters to change from Windows to Linux line breaks
    $string =~ s/\r//g if ($win);

    # Replace CR characters to change from Mac to Linux line breaks
    $string =~ s/\r/\n/g if ($mac);

    # Add closing slash to empty tags.
    # \b stops "col" matching <colgroup> and "frame" matching <frameset>.
    foreach my $tag (@empty_tags) {
        $empty += ($string =~ s/(<$tag\b.*?)("?) ?>/$1$2 \/>/gs);
    }
    # Tags that were already self-closed now end in " / />"; collapse that.
    $string =~ s# / /># />#g;

    # Update the HTML tag itself
    # NOTE(review): the original substitution was lost; this reconstruction
    # adds the XHTML namespace to any <html ...> tag that lacks one and keeps
    # its existing attributes.
    $string =~ s{<html(?![^>]*\bxmlns\s*=)((?:\s[^>]*)?)>}{<html xmlns="http://www.w3.org/1999/xhtml"$1>}g;

    # Update doctype or add if missing
    for (my $count = 0; $count < 3; $count++) {
        last if ($string =~ s/$old_doctypes1[$count]/$new_doctypes[$count]/);
        last if ($string =~ s/$old_doctypes2[$count]/$new_doctypes[$count]/);
    }
    if ($string !~ m/<!DOCTYPE/i) {
        # NOTE(review): which doctype the original inserted is unknown;
        # Transitional is assumed as the most permissive for legacy markup.
        $string = $new_doctypes[1] . "\n" . $string;
    }

    # Output
    if ($tty) {
        print $string;
        return;
    }

    if ($dir) { $output = File::Spec->catfile($dir, $output); }
    open(my $out, '>', $output)
        or die "ERROR: Unable to write to $output\n\n";
    binmode $out;
    print {$out} $string;
    # Buffered write errors only surface at close.
    close $out or die "ERROR: Unable to write to $output\n\n";
    return;
}

# _rewrite_tag($kind, $tagname, \@attr_pairs, $source)
#
# Builds the canonical form of one tag and returns it, or returns undef when
# the tag's literal text ($source) does not line up with what the parser
# reported (for example because an attribute value contained entities that
# the parser decoded, or is not double-quoted); such tags are left untouched.
#
#   $kind        "S" for a start tag, "E" for an end tag
#   $tagname     tag name as reported by the parser
#   \@attr_pairs flat list of attribute name/value pairs (empty for end tags)
#   $source      the tag exactly as it appears in the document
#
# Whitespace between the pieces of the original tag is preserved, as is an
# existing trailing "/" before ">".
sub _rewrite_tag {
    my ($kind, $tagname, $pairs, $source) = @_;

    my @attrs = @{$pairs};
    my %present;
    for (my $x = 0; $x < @attrs; $x += 2) {
        $present{ $attrs[$x] } = 1 if (defined $attrs[$x]);
    }

    # Change name attribute to id in <a> and <map> tags
    # NOTE(review): the two tag names were lost from the original comment and
    # code; <a> and <map> are an assumption. The rename is skipped when the
    # tag already carries an id so that no duplicate attribute is produced.
    my $rename_name = ($tagname eq 'a' || $tagname eq 'map') && !$present{id};

    my $find = ($kind eq 'E' ? '</' : '<') . $tagname;   # expected in the document
    my $emit = $find;                                    # written back

    # Process each attribute individually and make appropriate values lowercase
    for (my $x = 0; $x < @attrs; $x += 2) {
        my ($attr, $value) = @attrs[$x, $x + 1];
        next if (!$attr || $attr eq '/');
        $value = '' unless (defined $value);

        # Hex colour values: six digits first, then the three-digit form.
        $value =~ s/(\#[A-Fa-f0-9]{6})/\L$1\E/g;
        $value =~ s/(\#[A-Fa-f0-9]{3})/\L$1\E/g;
        $value = lc $value if (exists $lower_attr{$attr});

        my $out_attr = ($rename_name && $attr eq 'name') ? 'id' : $attr;
        $find .= qq{ $attr="$value"};
        $emit .= qq{ $out_attr="$value"};
    }

    # Every single space in the canonical form may be any run of whitespace in
    # the document; each literal piece is quoted so document text is never
    # interpreted as a pattern (and nothing is ever eval'ed).
    my @find_parts = split(/ /, $find, -1);
    my @emit_parts = split(/ /, $emit, -1);
    my $pattern = '\A' . join('(\s+)', map { quotemeta } @find_parts) . '(\s*/?)>\z';

    my @gaps = ($source =~ /$pattern/i);
    return unless (@gaps);

    my $tail    = pop @gaps;
    my $rebuilt = shift @emit_parts;
    foreach my $part (@emit_parts) {
        $rebuilt .= shift(@gaps) . $part;
    }
    return $rebuilt . $tail . '>';
}

# version()
#
# Prints the program version banner and exits.
sub version {
    print "HTX version $prog_version, Copyright (C) 2004-2008 Jamie Cheetham\n";
    print "Softham: http://www.softham.co.uk/\n\n";
    exit;
}

# usage()
#
# Prints the help text and exits.
# NOTE(review): the <...> placeholders in the usage lines were lost from the
# restored source and are reconstructed here.
sub usage {
    print <<"EOF";
HTX - HTML To XHTML Convertor $prog_version, Copyright (C) 2004-2008 Jamie Cheetham

Usage:  htx [options] <filename> [<output-filename>]
        htx [options] [--dir <directory>] <filename>
        htx [options] [--dir <directory>] --multi <filename> [<filename> ...]
        htx [--help|--version]

If the output filename isn't specified in single-file mode, the initial
file is overwritten with the updated code.

Options:
  -w   --win      Convert line breaks from Windows text files to use in Unix/Linux.
  -ma  --mac      Convert line breaks from Mac text files to use in Unix/Linux.
  -mu  --multi    Process multiple files simultaneously and overwrite them.
  -h   --help     Display this help and exit.
  -d   --dir      Specify the output directory.
  -t   --tty      Output results to STDOUT, rather than a file. Ignores --dir.
  -ver --verbose  Display extra information while processing.
  -v   --version  Display version number and exit.

EOF
    exit;
}

__END__