#!/usr/local/bin/perl
# html+tables v0.4 written by Brooks Cutter (bcutter@paradyne.com)
#   - creates preformatted text tables from HTML+ Table definitions.
#
#   - Japanese version 95/08/29(a1n@gfdl.gov)
#       --- avoid kanji-splitting
#
# This program will parse a HTML+ Document replace HTML+ Tables
# (<tbl>...</tbl>) with preformatted text tables.
# For more information on the specific details of HTML+ Tables see:
# http://info.cern.ch/pub/www/dev/htmlplus.dtd
# http://info.cern.ch/hypertext/WWW/Markup/htmlplus.dtd.txt
# http://info.cern.ch/hypertext/WWW/Markup/htmlplus.ps
#
# This program isn't intended to be used as a gateway - because
# of the overhead of continually parsing the documents (and the
# lack of security (ie: checking pathnames)) - but instead documents
# should be processed once with this program, like:
#
# html+tables file.html+ > file.html
#
# This program is a temporary measure - Until HTML+ Table support
# shows up in X/Mosaic, WinMosaic, Lynx, etc... (It should also
# be useful for anyone who puts together the "HTML to printed book"
# package)
#
# Note on HTML+ Compliance:
#
# This program implements most, but not all of the HTML+ table options.
# Furthermore, certain parts of the spec were vague (or more likely I
# didn't read all the information out there) such as how to handle
# wordwrapping.  The point is that while I hoped that when HTML+ tables
# appear in NCSA's X/Mosaic and Win/Mosaic, their behavior may be different
# then the output of this script..
#
# What it does and doesn't do:
#
# - preformatted ascii tables are surrouned with <pre> and </pre>
# - <tbl>..</tbl> is a table without borders
# - <tbl border>..</tbl> is a table with borders
# - <tt> and <tt top> put the title at the top of the table
# - <tt bottom> puts the table at the bottom
# - <th> and <td> are table header and table data - but they are treated
#   *EXACTLY* the same.  Also none of their options are supported 
#   like align=(left,center,right) or rowspan or colspan..
# - <tr> terminates a row.  It isn't necessary right before the </tbl> tag.
# - Empty rows are deleted, but empty columns aren't
# - It ignores CR's in the input stream - presumably the HTML way..
# - It will size the tables (and wrap the text in cells) according to the
#   length in $terminal_width 
# - when text is wrapped it attempts to break the lines on a space, a comma
# - and a period, in that order.  If it's a space, it doesn't show up
#   on the next line
# - the tags <a href>...</a> are supported, but <b>..</b> and <i>..</i>
#   aren't because of a bug in the viewers with enhanced preformatted
#   text.
# - when wordwrapping text with <a>..</a>, if it breaks the text in the
#   middle of the hyperlinked text, the hyperlink is continued to the
#   next line.
# - I think it goes without saying that <img src> isn't supported.. 
#   (Even if I could find the gif at conversion time to check the
#    size, I don't know what size the preformatted text will be rendered
#    in.  Besides, I don't need it...)
#
# Known bugs:
#
# - table may not be resized to _exactly_ $terminal_width..
# - When wrapping occurs in a table cell, after all wrapping is
#   completed there may be more than one space between the longest string
#   in table cell.
# - May be a few other small bugs with cell formatting
#
# Limitations:
#
# - a one character wide column requires 5 characters to display:
#   two for horizontal lines, two spaces and 1 character of display.
#   therefore, on a 80 column table the most columns you can have is
#   16 (80/5) columns.  The program will exit with a error message if you
#   exceed $terminal_width/5 columns
#
# Changes in v0.4: (first release)
# - fixed bugs in formatting lines with HTML tags
#
# Here are some examples of the output of html+tables:
# ----------------------------------------------------------------------
# Table #1: A simple table with a border, title at the top, 3 rows
# and 4 columns.
# ----------------------------------------------------------------------
# <tbl border>
# <tt top>An Example of a Table
# <th> <th>average<th>other<tr>
# <th> height <th> width <th> category<tr>
# <th> males<td>1.9<td>.003<td>yyy<tr>
# <th>females<td>1.7<td>.002<td>xxx<tr>
# <th><td><td>
# </tbl>
# ----------------------------------------------------------------------
# <pre>
#        An Example of a Table
# +---------+---------+----------+-----+
# |         | average | other    |     |
# |---------|---------|----------|-----|
# | height  | width   | category |     |
# |---------|---------|----------|-----|
# | males   | 1.9     | .003     | yyy |
# |---------|---------|----------|-----|
# | females | 1.7     | .002     | xxx |
# +---------+---------+----------+-----+
# </pre>

# Set this to the maximum width of the tables
# (Note due to a possible bug the real value may be a few more than
# this value - so 70 might output 70-75 column tables. Type: "rough int")
$terminal_width = 70;
# This hack is used in wordwrapping lines with HTML tags in them.
# it tells it to what percent it should try to fill the cells with.
# If it can't wordwrap (space,comma,period) it to this value or less
$fill_percent = '0.50'; # Valid values 0.1 - 1.0
# Info on this program
$program_ver = '0.4';
$program_contact = 'Brooks Cutter (bcutter@paradyne.com)';


require 'ctime.pl'; # Need this perl library
chop($date_n_time = &ctime(time));

print "<!-- The tables in this document were created on $date_n_time -->\n";
print "<!-- Tables created with HTML+Tables v$program_ver by $program_contact -->\n";

@links = &parse_html(<>);

$ctr = -1;
$x = $y = 1;
$in_table = 0;
while (defined $links[++$ctr]) {
  if (($links[$ctr] =~ /^\s*$/) && (!$in_table)) { 
    print $links[$ctr];
    next;
  }
  $data = $links[$ctr];
  $data =~ s/^\s+//; # Delete leading space
  $data =~ s/\s+$//; # Delete trailing space
  $data =~ tr/\r\n//d; # Delete linefeed, Carriage Return
  if ((!$in_table) && ($data =~ m!^<tbl\s*(.*)>$!i)) {
    $in_table = 1; 
    $table_opts = $1;
    $table_border = 1 if ($table_opts =~ /border/i);
    next;
  }
  if (($in_table) && ($data =~ m!^</tbl!i)) {
    $in_table = 0;
    &print_table($table_title,*table);
    $table_title = '';
    %table = ();
    undef @col_len;
    $x = $y = 1;
    next;
  }
  if ($in_table) {
    if ($data =~ /^<tt\s*(\S*)>$/i) { # Table Title
      $loc = $1;
      $loc =~ tr/A-Z/a-z/;
      $ctr++;
      $data = $links[$ctr];
      $data =~ s/^\s+//; # Delete leading space
      $data =~ s/\s+$//; # Delete trailing space
      $data =~ tr/\x0c\x0a//d; # Delete linefeed, Carriage Return
      $table_title = $data;
      $table_bottom = 1 if ($loc eq 'bottom');
      next;
    }
    if ($data =~ /^<\s*t[dh]/i) {
    #if ($data =~ /^<t[dhr]/i) {
      # Retrieve until next <td> or <tr> or <th>
      $col_len = 0;
      while ((defined $links[++$ctr]) && (!($links[$ctr] =~ /^<\s*t[rhd]/i))) {
        $data = $links[$ctr];
        next if ($data =~ /^\s*$/);
        $data =~ s/[\r\n]+/ /g; # Replace linefeed, Carriage Return
        $data =~ s/^\s+//; # Delete leading space
        $data =~ s/\s+$//; # Delete trailing space
				# If you want to try <b>/<i> change this line here
				# (and the length/substr below)
        next if ((substr($data,0,1) eq '<') 
          && (substr($data,0,2) ne '<a')
          && (substr($data,0,3) ne '</a'));
        # These are temp - for including <>'s
        unless (defined $table{$y,$x}) {
          $table{$y,$x} = $data;
        } else {
          # If current link is a tag, or previous link is the beginning of
          # a href tag, then don't prefix it with space...
          if ((substr($data,0,1) eq '<') 
          || (substr($links[$ctr-1],0,2) eq '<a')) {
            $table{$y,$x} .= $data;
          } else {
            $table{$y,$x} .= " $data";
            $col_len++; # To account for preceeding space
          }
        }
        $col_len += length($data) unless 
          ((substr($data,0,2) eq '<a') || (substr($data,0,3) eq '</a'));
      }
      $col_len[$x] = $col_len if ($col_len[$x] <= $col_len);
      #$table_len{$y,$x} = $col_len;
      if ($links[$ctr] =~ /^<tr/i) { # Next Table Row
        $y++; $x = 1;
        next; # Next Row
      }
      $x++;
      $ctr--;
      next;
    }
    next;
  } else {
    print $links[$ctr];
  }
}
&print_table($table_title,*table);
$table_title = '';

#
exit;
sub print_table {
  local($title) = shift(@_);
  local(*table) = shift(@_);
  local($row,$row_len,$maxlen,$maxrow,$x,$y,$fmt,$val,$line,$num_rows,$eval,$_);
  local($col);
  return if ($#col_len == -1);
  local($next_line,$emptyrow,@empty);
  #local($terminal_width) = 65;
  &abort_table($#col_len+1) if (int($terminal_width/5) < ($#col_len+1));
  $row = 1;
  $row_len = 0;
  $maxlen = 0;
  $maxrow = 0;
  $emptyrow = 1;
  for (sort sort_table keys %table) {
    ($y,$x) = split(/$;/,$_,2);
  
    if ($y != $row) {
      # For last row...
      $empty[$row] = $emptyrow;
      $row_len += 2; # space and border
      $maxlen = $row_len if ($row_len > $maxlen);
      # New Row
      $row_len = 0;
      $row = $y;
      $num_rows++;
      $emptyrow = 1;
    }
    $row_len += 2 + $col_len[$x];
  
    if(length $table{$y,$x}) {
      $emptyrow = 0;
    }
  }
  # Last row
  $row_len += 2; # space and border
  $maxlen = $row_len if ($row_len > $maxlen);
  $num_rows++;
  $empty[$row] = $emptyrow;
  
  local($tbl_vline,$tbl_hline,$tbl_intersection);
  if ($table_border) {
    ($tbl_vline,$tbl_hline,$tbl_intersection) = ('|','-','+');
  } else {
    ($tbl_vline,$tbl_hline,$tbl_intersection) = (' ',' ',' ');
  }
  $next_line = $fmt = $tbl_vline;
  $line = $tbl_intersection;

  $val = '';
  &size_to_win($terminal_width,*col_len,*maxlen);
  for $col (1..$#col_len) {
    $fmt .= " %s $tbl_vline";
    $line .= $tbl_hline x ($col_len[$col]+2) . $tbl_intersection;
    $next_line .= $tbl_hline x ($col_len[$col]+2) . $tbl_vline;
  }
  $next_line .= "\n";
  $line .= "\n";
  
  print "<pre>\n";
  if (($title) && (!$table_bottom)) {
    print 
    &center(($maxlen < $terminal_width ? $maxlen : $terminal_width),$title),
    "\n";
  }

  $first = 1;
  print $line;
  local(@data);
  for $y (1..$num_rows) { # Row at a time
    next if ($empty[$y]);
    print $next_line unless($first); $first = 0;
    do {
      @data = ();
      $loop = 0;
      for $x (1..$#col_len) {
        #if (defined $table{$y,$x}) {
        if (length $table{$y,$x}) {
          ($data[$x],$therest) = &url_substr($table{$y,$x},$col_len[$x]);
#          ($data[$x],$therest) = &url_substr($table{$y,$x},$col_len[$x]+1);
          if (length $therest) { $table{$y,$x} = $therest; $loop = 1; }
          else { delete $table{$y,$x}; }
        } else {
          $data[$x] = &pad($table{$y,$x},$col_len[$x]);
          delete $table{$y,$x};
        }
      }
      shift @data; # drop data[0]
      $eval = 'printf("'.$fmt. '\n",@data);';
      eval $eval;
      die "error eval'ing '$eval': $@\n" if ($@);
    } while ($loop);
  }
  print $line;
  if (($title) && ($table_bottom)) {
    print 
    &center(($maxlen < $terminal_width ? $maxlen : $terminal_width),$title),
    "\n";
  }
  print "</pre>\n";
}

exit;

sub sort_table {
  local($a_y,$a_x) = split(/$;/,$a);
  local($b_y,$b_x) = split(/$;/,$b);

  return($a_y <=> $b_y) if ($a_y != $b_y);
  return($a_x <=> $b_x);
}

sub parse_html {
  local(@data) = ();
  local($save);
  NEXTLINE: for (@_) {
    #$save .= "$_ ";
    $save .= $_;
    if ((($lt = index($save,'<')) == -1) || (index($save,'>',$lt) == -1))
      { next; }
    $lt = $gt = 0;
    while (($lt = index($save, '<', $gt)) >= $[) {
      # This is the data *BEFORE* the '<'
      if ($lt) { # do If isn't /^</
        if ($gt) {
          $data = substr($save, ($gt+1), ($lt-$gt-1));
        } else {
          $data = substr($save, ($gt), ($lt-$gt));
        }
        push(@data, $data);
      }
      $gt = index($save, '>', $lt);
      if ($gt == -1) {
        $save = substr($save, $lt);
        next NEXTLINE;
      }
      # This is the data *INSIDE* the <>
      $data = substr($save, $lt, ($gt-$lt+1));
      push(@data, $data);
    }
    $save = substr($save, ($gt+1));
  }
  push(@data, $save);
  return(@data);
}

sub center {
  local($width) = shift(@_);
  local($string) = shift(@_);
  local($length) = length($string);
  local($pad) = (($width - $length)/2);
  return(' ' x $pad . $string);
}

sub numerically {$a <=> $b;}

sub size_to_win {
  local($term_width) = shift(@_);
  local(*col_len) = shift(@_);
  local(*maxlen) = shift(@_);
  local($col,%col_len2col,@col_len_sort);
  local($long,$next_long);
  for $col (1..$#col_len) { 
    if (length $col_len2col{$col_len[$col]}) {
      $col_len2col{$col_len[$col]} .= ",$col";
    } else {
      $col_len2col{$col_len[$col]} = $col;
    }
  }
  while ($maxlen > $term_width) {
    local(@cols) = ();
    local($decrease_by) = 0;
    @col_len_sort = reverse sort numerically @col_len;
    $long = shift(@col_len_sort);
    $next_long = shift(@col_len_sort);
    $col = $col_len2col{$long};
    if (index($col_len2col{$long},',') != -1) {
      @cols = split(/,/,$col_len2col{$long});
      $col = shift(@cols);
      $col_len2col{$long} = join(',',@cols);
    } else {
      $col = $col_len2col{$long};
      delete $col_len2col{$long};
    }
    if (($maxlen - ($long - $next_long) + 1) < $term_width) {
      # Would actually be smaller than window instead of exact...
      $decrease_by = ($maxlen - $term_width);
      $col_len[$col] -= $decrease_by;
      $maxlen -= $decrease_by;
    } else {
      $decrease_by = ($long - $next_long + 1);
      $col_len[$col] -= $decrease_by;
      $maxlen -= $decrease_by;
    }
    if (length $col_len2col{$col_len[$col]}) {
      $col_len2col{$col_len[$col]} .= ",$col";
    } else {
      $col_len2col{$col_len[$col]} = $col;
    }
  }
}

sub url_substr {
  local($str) = shift(@_);
  local($maxlen) = shift(@_);
  local($space_len) = shift(@_) || $maxlen;
  local($segment,$therest);
  local($term,$tag,$tag);
  local($curlen) = 0;

  # These are used to detect <html>tags</html>
  local($lt_ndx) = rindex($str,'<',$maxlen);
  local($gt_ndx) = rindex($str,'>',$maxlen);
  if (($lt_ndx == -1) && ($gt_ndx == -1)) {
  # No tags in string at all
    local($delim,$ndx,$ndx2);
    $curlen = length($str);
    $str =~ s/^\s*//; $str =~ s/\s*$//; 
    if (length($str) < $maxlen) {
      $segment = $str;
			$str = '';
      $segment = &pad($segment,$space_len);
      return($segment,undef);
    } else {
#     $delim = ' '; $ndx = rindex($str,' ',$maxlen); $ndx2 = $ndx+1;
      $ndx=-1;
      if ($ndx == -1) 
        { $delim = ')'; $ndx=rindex($str,$delim,$maxlen-1); $ndx2 = $ndx+1; }
      if ($ndx == -1) 
	{ $delim = '('; $ndx=rindex($str,$delim,$maxlen-1); $ndx2 = $ndx; }
      if ($ndx == -1) 
        { $delim = ' '; $ndx=rindex($str,' ',$maxlen-1)   ; $ndx2 = $ndx+1; }
      if ($ndx == -1) 
        { $delim = ','; $ndx=rindex($str,$delim,$maxlen-1); $ndx2 = $ndx+1; }
      if ($ndx == -1) 
        { $delim = '.'; $ndx=rindex($str,$delim,$maxlen-1); $ndx2 = $ndx+1; }
      $ndx = -1 if ($ndx == 0);
      if ($ndx == -1) {
	local(@tmp) = split(//,$str); local(@tmp2)=(); local($i);
	for($i=0;$i<$maxlen;$i++) {
	   if((@tmp[0] !~ /[ -~]/) && ++$i<$maxlen){ push(@tmp2,shift(@tmp)); }
           if($i<$maxlen) { push(@tmp2,shift(@tmp)); }
        }
	$segment = join("",@tmp2);
	$therest = join("",@tmp);
##	$segment = substr($str,0,$maxlen);
##      $therest = substr($str,$maxlen);
        $curlen = length($segment);
      } else {
        $segment = substr($str,0,$ndx2);
        $curlen = length($segment);
        $therest = substr($str,$ndx2);
        $therest =~ s/^ //;
      }
    }
  } else {
    # Break into links for easier handling
    local(@links) = &parse_html($str); 
  
    local($link_ctr) = -1;
    while (defined $links[++$link_ctr]) {
      next if ($links[$link_ctr] =~ /^\s*$/);
      if ($links[$link_ctr] =~ m!^<\s*(/)?\s*(.+)\s*>$!) {
        $term = ($1 eq '/');
        $tag = $2;
        if ($tag =~ /^\s*([abi])/i) { # end tag must be there.
          local($tag_c) = $1;
          # Pray for no nested tags (bug in waiting)
          if ($term) {
            $segment .= $end_tag;
            $begin_tag = $end_tag = '';
          } else {
            $begin_tag = $links[$link_ctr];
            $end_tag = "</$tag_c>";
            $segment .= $links[$link_ctr];
          }
        } 
      } else {
        if (($curlen >= $maxlen) || ($curlen >= int($fill_percent * $maxlen))) {
          if ($begin_tag) {
            $segment .= $end_tag;
            $therest = $begin_tag . join('',@links[$link_ctr..$#links]);
          } else {
            $therest = join('',@links[$link_ctr..$#links]);
          }
          last;
        }

        $link_length = length($links[$link_ctr]);
        if (($curlen+$link_length) > $maxlen) {
          local($delim,$ndx,$ndx2);
          $delim = ' '; 
          $ndx = rindex($links[$link_ctr],$delim,($maxlen - $curlen));
          $ndx2 = $ndx +1; 
          if ($ndx == -1) {
            $delim = ','; 
            $ndx = rindex($links[$link_ctr],$delim,($maxlen - $curlen));
            $ndx2 = $ndx; 
          }
          if ($ndx == -1) {
            $delim = '.'; 
            $ndx = rindex($links[$link_ctr],$delim,($maxlen - $curlen));
            $ndx2 = $ndx;
          }
          $ndx = $maxlen if ($ndx <= 0);
          local($cutlen);
          if ($ndx <= $maxlen) {
            $cutlen = $ndx;
            $cutlen2 = $ndx2;
          } else {
            $cutlen2 = $cutlen = ($maxlen-$curlen);
          } 

          $curlen += $cutlen;
          $segment .= substr($links[$link_ctr],0,$cutlen);
          substr($links[$link_ctr],0,$cutlen2) = '';
          $link_ctr--;
        } else {
          $curlen += $link_length;
          $segment .= $links[$link_ctr];
        }
      }
    }
  }
  if ($curlen < $space_len) {
    $segment .= ' ' x ($space_len - $curlen);
  }
  return($segment,$therest);
}

#sub debug {
#  print STDERR @_,"\n";
#}

sub pad {
  local($v) = substr($_[0],0,$_[1]);
  return($v . ' ' x ($_[1] - length($v)));
}

sub abort_table {
  local($num_col) = shift(@_);
  local($tw5) = int($terminal_width/5);

  print STDERR <<EOF;
  ####    ####   #####   #####    #   #
 #       #    #  #    #  #    #    # #
  ####   #    #  #    #  #    #     #
      #  #    #  #####   #####      #
 #    #  #    #  #   #   #   #      #
  ####    ####   #    #  #    #     #

I was unable to parse one of the tables because there were too many
columns in the table.  In order for this program to run, the following
must be true:

(\$terminal_width/5) < $num_col

The program failed because:

($terminal_width / 5) = $tw5
and
$tw5 >= $num_col

In order to run this, you can either decrease the number of columns in
the table, or increase the value of \$terminal_width which is currently
set to $terminal_width.

The program will now exit.

EOF

  exit;
}
