lib/TWiki/Plugins/WysiwygPlugin/HTML2TML.pm
author Colas Nahaboo <colas@nahaboo.net>
Sat, 26 Jan 2008 15:50:53 +0100
changeset 0 414e01d06fd5
child 1 e2915a7cbdfa
permissions -rw-r--r--
RELEASE 4.2.0 freetown
colas@0
     1
# Copyright (C) 2005 ILOG http://www.ilog.fr
colas@0
     2
# and TWiki Contributors. All Rights Reserved. TWiki Contributors
colas@0
     3
# are listed in the AUTHORS file in the root of this distribution.
colas@0
     4
# NOTE: Please extend that file, not this notice.
colas@0
     5
#
colas@0
     6
# This program is free software; you can redistribute it and/or
colas@0
     7
# modify it under the terms of the GNU General Public License
colas@0
     8
# as published by the Free Software Foundation; either version 2
colas@0
     9
# of the License, or (at your option) any later version. For
colas@0
    10
# more details read LICENSE in the root of the TWiki distribution.
colas@0
    11
#
colas@0
    12
# This program is distributed in the hope that it will be useful,
colas@0
    13
# but WITHOUT ANY WARRANTY; without even the implied warranty of
colas@0
    14
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
colas@0
    15
#
colas@0
    16
# As per the GPL, removal of this notice is prohibited.
colas@0
    17
colas@0
    18
=pod
colas@0
    19
colas@0
    20
---+ package TWiki::Plugins::WysiwygPlugin::HTML2TML;
colas@0
    21
colas@0
    22
Convertor for translating HTML into TML (TWiki Meta Language)
colas@0
    23
colas@0
    24
The conversion is done by parsing the HTML and generating a parse
colas@0
    25
tree, and then converting that parse treeinto TML.
colas@0
    26
colas@0
    27
The class is a subclass of HTML::Parser, run in XML mode, so it
colas@0
    28
should be tolerant to many syntax errors, and will also handle
colas@0
    29
XHTML syntax.
colas@0
    30
colas@0
    31
The translator tries hard to make good use of newlines in the
colas@0
    32
HTML, in order to maintain text level formating that isn't
colas@0
    33
reflected in the HTML. So the parser retains newlines and
colas@0
    34
spaces, rather than throwing them away, and uses various
colas@0
    35
heuristics to determine which to keep when generating
colas@0
    36
the final TML.
colas@0
    37
colas@0
    38
=cut
colas@0
    39
colas@0
    40
package TWiki::Plugins::WysiwygPlugin::HTML2TML;
colas@0
    41
use base 'HTML::Parser';
colas@0
    42
colas@0
    43
use strict;
colas@0
    44
colas@0
    45
require TWiki::Plugins::WysiwygPlugin::HTML2TML::Node;
colas@0
    46
require TWiki::Plugins::WysiwygPlugin::HTML2TML::Leaf;
colas@0
    47
require HTML::Parser;
colas@0
    48
colas@0
    49
# Entities that are safe to convert back to 8-bit characters without
colas@0
    50
# tripping over Perl's crappy UTF-8 support.
colas@0
    51
my %safe_entities = (
colas@0
    52
    iexcl  => 161, cent   => 162, pound  => 163,
colas@0
    53
    curren => 164, yen    => 165, brvbar => 166, sect   => 167,
colas@0
    54
    uml    => 168, copy   => 169, ordf   => 170, laquo  => 171,
colas@0
    55
    not    => 172, shy    => 173, reg    => 174, macr   => 175,
colas@0
    56
    deg    => 176, plusmn => 177, sup2   => 178, sup3   => 179,
colas@0
    57
    acute  => 180, micro  => 181, para   => 182, middot => 183,
colas@0
    58
    cedil  => 184, sup1   => 185, ordm   => 186, raquo  => 187,
colas@0
    59
    frac14 => 188, frac12 => 189, frac34 => 190, iquest => 191,
colas@0
    60
    Agrave => 192, Aacute => 193, Acirc  => 194, Atilde => 195,
colas@0
    61
    Auml   => 196, Aring  => 197, AElig  => 198, Ccedil => 199,
colas@0
    62
    Egrave => 200, Eacute => 201, Ecirc  => 202, Euml   => 203,
colas@0
    63
    Igrave => 204, Iacute => 205, Icirc  => 206, Iuml   => 207,
colas@0
    64
    ETH    => 208, Ntilde => 209, Ograve => 210, Oacute => 211,
colas@0
    65
    Ocirc  => 212, Otilde => 213, Ouml   => 214, times  => 215,
colas@0
    66
    Oslash => 216, Ugrave => 217, Uacute => 218, Ucirc  => 219,
colas@0
    67
    Uuml   => 220, Yacute => 221, THORN  => 222, szlig  => 223,
colas@0
    68
    agrave => 224, aacute => 225, acirc  => 226, atilde => 227,
colas@0
    69
    auml   => 228, aring  => 229, aelig  => 230, ccedil => 231,
colas@0
    70
    egrave => 232, eacute => 233, ecirc  => 234, uml    => 235,
colas@0
    71
    igrave => 236, iacute => 237, icirc  => 238, iuml   => 239,
colas@0
    72
    eth    => 240, ntilde => 241, ograve => 242, oacute => 243,
colas@0
    73
    ocirc  => 244, otilde => 245, ouml   => 246, divide => 247,
colas@0
    74
    oslash => 248, ugrave => 249, uacute => 250, ucirc  => 251,
colas@0
    75
    uuml   => 252, yacute => 253, thorn  => 254, yuml   => 255,
colas@0
    76
);
colas@0
    77
colas@0
    78
my $safe_entities_re = join('|', keys %safe_entities);
colas@0
    79
colas@0
    80
=pod
colas@0
    81
colas@0
    82
---++ ClassMethod new()
colas@0
    83
colas@0
    84
Constructs a new HTML to TML convertor.
colas@0
    85
colas@0
    86
You *must* provide parseWikiUrl and convertImage if you want URLs
colas@0
    87
translated back to wikinames. See WysiwygPlugin.pm for an example
colas@0
    88
of how to call it.
colas@0
    89
colas@0
    90
=cut
colas@0
    91
colas@0
    92
sub new {
colas@0
    93
    my( $class ) = @_;
colas@0
    94
colas@0
    95
    my $this = new HTML::Parser( start_h => [\&_openTag, 'self,tagname,attr' ],
colas@0
    96
                                 end_h => [\&_closeTag, 'self,tagname'],
colas@0
    97
                                 declaration_h => [\&_ignore, 'self'],
colas@0
    98
                                 default_h => [\&_text, 'self,text'],
colas@0
    99
                                 comment_h => [\&_comment, 'self,text'] );
colas@0
   100
colas@0
   101
    $this = bless( $this, $class );
colas@0
   102
colas@0
   103
    $this->xml_mode( 1 );
colas@0
   104
    if ($this->can('empty_element_tags')) {
colas@0
   105
        # protected because not there in some HTML::Parser versions
colas@0
   106
        $this->empty_element_tags( 1 );
colas@0
   107
    };
colas@0
   108
    $this->unbroken_text( 1 );
colas@0
   109
colas@0
   110
    return $this;
colas@0
   111
}
colas@0
   112
colas@0
   113
sub _resetStack {
colas@0
   114
    my $this = shift;
colas@0
   115
colas@0
   116
    $this->{stackTop} =
colas@0
   117
      new TWiki::Plugins::WysiwygPlugin::HTML2TML::Node( $this->{opts}, '' );
colas@0
   118
    $this->{stack} = ();
colas@0
   119
}
colas@0
   120
colas@0
   121
=pod
colas@0
   122
colas@0
   123
---++ ObjectMethod convert( $html ) -> $tml
colas@0
   124
colas@0
   125
Convert a block of HTML text into TML.
colas@0
   126
colas@0
   127
=cut
colas@0
   128
colas@0
   129
sub convert {
colas@0
   130
    my( $this, $text, $options ) = @_;
colas@0
   131
colas@0
   132
    $this->{opts} = $options;
colas@0
   133
colas@0
   134
    my $opts = 0;
colas@0
   135
    $opts = $WC::VERY_CLEAN
colas@0
   136
      if ( $options->{very_clean} );
colas@0
   137
colas@0
   138
    # Item5138: Convert 8-bit entities back into characters
colas@0
   139
    $text =~ s/&($safe_entities_re);/chr($safe_entities{$1})/ego;
colas@0
   140
    $text =~ s/(&#(\d+);)/$2 > 127 && $2 <= 255 ? chr($2) : $1/eg;
colas@0
   141
    $text =~ s/(&#x([\dA-Fa-f]+);)/(hex($2) > 127 && hex($2)) <= 255 ? chr(hex($2)) : $1/eg;
colas@0
   142
colas@0
   143
    # get rid of nasties
colas@0
   144
    $text =~ s/\r//g;
colas@0
   145
    $this->_resetStack();
colas@0
   146
    $this->parse( $text );
colas@0
   147
    $this->eof();
colas@0
   148
    #print STDERR "Finished\n";
colas@0
   149
    $this->_apply( undef );
colas@0
   150
    return $this->{stackTop}->rootGenerate( $opts );
colas@0
   151
}
colas@0
   152
colas@0
   153
# Autoclose tags without waiting for a /tag
colas@0
   154
my %autoClose = map { $_ => 1 } qw( area base basefont br col embed frame hr input link meta param );
colas@0
   155
colas@0
   156
# Support auto-close of the tags that are most typically incorrectly
colas@0
   157
# nested. Autoclose triggers when a second tag of the same type is
colas@0
   158
# seen without the first tag being closed.
colas@0
   159
my %closeOnRepeat = map { $_ => 1 } qw( li td th tr );
colas@0
   160
colas@0
   161
sub _openTag {
colas@0
   162
    my( $this, $tag, $attrs ) = @_;
colas@0
   163
colas@0
   164
    $tag = lc($tag);
colas@0
   165
colas@0
   166
    if ($closeOnRepeat{$tag} &&
colas@0
   167
          $this->{stackTop} &&
colas@0
   168
            $this->{stackTop}->{tag} eq $tag) {
colas@0
   169
        #print STDERR "Close on repeat $tag\n";
colas@0
   170
        $this->_apply($tag);
colas@0
   171
    }
colas@0
   172
colas@0
   173
    push( @{$this->{stack}}, $this->{stackTop} ) if $this->{stackTop};
colas@0
   174
    $this->{stackTop} =
colas@0
   175
      new TWiki::Plugins::WysiwygPlugin::HTML2TML::Node(
colas@0
   176
          $this->{opts}, $tag, $attrs );
colas@0
   177
colas@0
   178
    if ($autoClose{$tag}) {
colas@0
   179
        #print STDERR "Autoclose $tag\n";
colas@0
   180
        $this->_apply($tag);
colas@0
   181
    }
colas@0
   182
}
colas@0
   183
colas@0
   184
sub _closeTag {
colas@0
   185
    my( $this, $tag ) = @_;
colas@0
   186
colas@0
   187
    $tag = lc($tag);
colas@0
   188
colas@0
   189
    while ($this->{stackTop} &&
colas@0
   190
             $this->{stackTop}->{tag} ne $tag &&
colas@0
   191
               $autoClose{$this->{stackTop}->{tag}}) {
colas@0
   192
        #print STDERR "Close mismatched $this->{stackTop}->{tag}\n";
colas@0
   193
        $this->_apply($this->{stackTop}->{tag});
colas@0
   194
    }
colas@0
   195
    if ($this->{stackTop} &&
colas@0
   196
          $this->{stackTop}->{tag} eq $tag) {
colas@0
   197
        #print STDERR "Closing $tag\n";
colas@0
   198
        $this->_apply($tag);
colas@0
   199
    }
colas@0
   200
}
colas@0
   201
colas@0
   202
sub _text {
colas@0
   203
    my( $this, $text ) = @_;
colas@0
   204
    my $l = new TWiki::Plugins::WysiwygPlugin::HTML2TML::Leaf( $text );
colas@0
   205
    $this->{stackTop}->addChild( $l );
colas@0
   206
}
colas@0
   207
colas@0
   208
sub _comment {
colas@0
   209
    my( $this, $text ) = @_;
colas@0
   210
    my $l = new TWiki::Plugins::WysiwygPlugin::HTML2TML::Leaf( $text );
colas@0
   211
    $this->{stackTop}->addChild( $l );
colas@0
   212
}
colas@0
   213
colas@0
   214
sub _ignore {
colas@0
   215
}
colas@0
   216
colas@0
   217
sub _apply {
colas@0
   218
    my( $this, $tag ) = @_;
colas@0
   219
colas@0
   220
    while( $this->{stack} && scalar( @{$this->{stack}} )) {
colas@0
   221
        my $top = $this->{stackTop};
colas@0
   222
        #print STDERR "Pop $top->{tag}\n";
colas@0
   223
        $this->{stackTop} = pop( @{$this->{stack}} );
colas@0
   224
        die unless $this->{stackTop};
colas@0
   225
        $this->{stackTop}->addChild( $top );
colas@0
   226
        last if( $tag && $top->{tag} eq $tag );
colas@0
   227
    }
colas@0
   228
}
colas@0
   229
colas@0
   230
1;