lib/CPAN/lib/Locale/Maketext/Lexicon/Gettext.pm
author Colas Nahaboo <colas@nahaboo.net>
Sat, 26 Jan 2008 15:50:53 +0100
changeset 0 414e01d06fd5
permissions -rw-r--r--
RELEASE 4.2.0 freetown
colas@0
     1
package Locale::Maketext::Lexicon::Gettext;
colas@0
     2
$Locale::Maketext::Lexicon::Gettext::VERSION = '0.14';
colas@0
     3
colas@0
     4
use strict;
colas@0
     5
colas@0
     6
=head1 NAME
colas@0
     7
colas@0
     8
Locale::Maketext::Lexicon::Gettext - PO and MO file parser for Maketext
colas@0
     9
colas@0
    10
=head1 SYNOPSIS
colas@0
    11
colas@0
    12
Called via B<Locale::Maketext::Lexicon>:
colas@0
    13
colas@0
    14
    package Hello::I18N;
colas@0
    15
    use base 'Locale::Maketext';
colas@0
    16
    use Locale::Maketext::Lexicon {
colas@0
    17
        de => [Gettext => 'hello/de.mo'],
colas@0
    18
    };
colas@0
    19
colas@0
    20
Directly calling C<parse()>:
colas@0
    21
colas@0
    22
    use Locale::Maketext::Lexicon::Gettext;
colas@0
    23
    my %Lexicon = %{ Locale::Maketext::Lexicon::Gettext->parse(<DATA>) };
colas@0
    24
    __DATA__
colas@0
    25
    #: Hello.pm:10
colas@0
    26
    msgid "Hello, World!"
colas@0
    27
    msgstr "Hallo, Welt!"
colas@0
    28
colas@0
    29
    #: Hello.pm:11
colas@0
    30
    msgid "You have %quant(%1,piece) of mail."
colas@0
    31
    msgstr "Sie haben %quant(%1,Poststueck,Poststuecken)."
colas@0
    32
colas@0
    33
=head1 DESCRIPTION
colas@0
    34
colas@0
    35
This module implements a perl-based C<Gettext> parser for
colas@0
    36
B<Locale::Maketext>. It transforms all C<%1>, C<%2>, <%*>... sequences
colas@0
    37
to C<[_1]>, C<[_2]>, C<[_*]>, and so on.  It accepts either plain PO
colas@0
    38
file, or a MO file which will be handled with a pure-perl parser
colas@0
    39
adapted from Imacat's C<Locale::Maketext::Gettext>.
colas@0
    40
colas@0
    41
Since version 0.03, this module also looks for C<%I<function>(I<args...>)>
colas@0
    42
in the lexicon strings, and transform it to C<[I<function>,I<args...>]>.
colas@0
    43
Any C<%1>, C<%2>... sequences inside the I<args> will have their percent
colas@0
    44
signs (C<%>) replaced by underscores (C<_>).
colas@0
    45
colas@0
    46
The name of I<function> above should begin with a letter or underscore,
colas@0
    47
followed by any number of alphanumeric characters and/or underscores.
colas@0
    48
As an exception, the function name may also consist of a single asterisk
colas@0
    49
(C<*>) or pound sign (C<#>), which are C<Locale::Maketext>'s shorthands
colas@0
    50
for C<quant> and C<numf>, respectively.
colas@0
    51
colas@0
    52
As an additional feature, this module also parses MIME-header style
colas@0
    53
metadata specified in the null msgstr (C<"">), and add them to the
colas@0
    54
C<%Lexicon> with a C<__> prefix.  For example, the example above will
colas@0
    55
set C<__Content-Type> to C<text/plain; charset=iso8859-1>, without
colas@0
    56
the newline or the colon.
colas@0
    57
colas@0
    58
Any normal entry that duplicates a metadata entry takes precedence.
colas@0
    59
Hence, a C<msgid "__Content-Type"> line occurs anywhere should override
colas@0
    60
the above value.
colas@0
    61
colas@0
    62
=head1 OPTIONS
colas@0
    63
colas@0
    64
=head2 use_fuzzy
colas@0
    65
colas@0
    66
When parsing PO files, fuzzy entries (entries marked with C<#, fuzzy>)
colas@0
    67
are silently ignored.  If you wish to use fuzzy entries, specify a true
colas@0
    68
value to the C<_use_fuzzy> option:
colas@0
    69
colas@0
    70
    use Locale::Maketext::Lexicon {
colas@0
    71
        de => [Gettext => 'hello/de.mo'],
colas@0
    72
        _use_fuzzy => 1,
colas@0
    73
    };
colas@0
    74
colas@0
    75
=head2 allow_empty
colas@0
    76
colas@0
    77
When parsing PO files, empty entries (entries with C<msgstr "">) are
colas@0
    78
silently ignored.  If you wish to allow empty entries, specify a true
colas@0
    79
value to the C<_allow_empty> option:
colas@0
    80
colas@0
    81
    use Locale::Maketext::Lexicon {
colas@0
    82
        de => [Gettext => 'hello/de.mo'],
colas@0
    83
        _allow_empty => 1,
colas@0
    84
    };
colas@0
    85
colas@0
    86
=cut
colas@0
    87
colas@0
    88
my ($InputEncoding, $OutputEncoding, $DoEncoding);
colas@0
    89
colas@0
    90
sub input_encoding { $InputEncoding };
colas@0
    91
sub output_encoding { $OutputEncoding };
colas@0
    92
colas@0
    93
sub parse {
colas@0
    94
    my $self = shift;
colas@0
    95
    my (%var, $key, @ret);
colas@0
    96
    my @metadata;
colas@0
    97
colas@0
    98
    $InputEncoding = $OutputEncoding = $DoEncoding = undef;
colas@0
    99
colas@0
   100
    use Carp;
colas@0
   101
    Carp::cluck "Undefined source called\n" unless defined $_[0];
colas@0
   102
colas@0
   103
    # Check for magic string of MO files
colas@0
   104
    return parse_mo(join('', @_))
colas@0
   105
        if ($_[0] =~ /^\x95\x04\x12\xde/ or $_[0] =~ /^\xde\x12\x04\x95/);
colas@0
   106
colas@0
   107
    local $^W;  # no 'uninitialized' warnings, please.
colas@0
   108
colas@0
   109
    require Locale::Maketext::Lexicon;
colas@0
   110
    my $UseFuzzy = Locale::Maketext::Lexicon::option('use_fuzzy');
colas@0
   111
    my $AllowEmpty = Locale::Maketext::Lexicon::option('allow_empty');
colas@0
   112
    my $process = sub {
colas@0
   113
            if ( length($var{msgstr}) and ($UseFuzzy or !$var{fuzzy}) ) {
colas@0
   114
                push @ret, (map transform($_), @var{'msgid', 'msgstr'});
colas@0
   115
            }
colas@0
   116
            elsif ( $AllowEmpty ) {
colas@0
   117
                push @ret, (transform($var{msgid}), '');
colas@0
   118
            }
colas@0
   119
            push @metadata, parse_metadata($var{msgstr})
colas@0
   120
                if $var{msgid} eq '';
colas@0
   121
            %var = ();
colas@0
   122
    };
colas@0
   123
colas@0
   124
    # Parse PO files
colas@0
   125
    foreach (@_) {
colas@0
   126
        s/[\015\012]*\z//; # fix CRLF issues
colas@0
   127
colas@0
   128
        /^(msgid|msgstr) +"(.*)" *$/    ? do {  # leading strings
colas@0
   129
            $var{$1} = $2;
colas@0
   130
            $key = $1;
colas@0
   131
        } :
colas@0
   132
colas@0
   133
        /^"(.*)" *$/                    ? do {  # continued strings
colas@0
   134
            $var{$key} .= $1;
colas@0
   135
        } :
colas@0
   136
colas@0
   137
        /^#, +(.*) *$/                  ? do {  # control variables
colas@0
   138
            $var{$_} = 1 for split(/,\s+/, $1);
colas@0
   139
        } :
colas@0
   140
colas@0
   141
        /^ *$/ && %var                  ? do {  # interpolate string escapes
colas@0
   142
		$process->($_);
colas@0
   143
        } : ();
colas@0
   144
    }
colas@0
   145
    # do not silently skip last entry
colas@0
   146
    $process->() if keys %var != 0;
colas@0
   147
colas@0
   148
    push @ret, map { transform($_) } @var{'msgid', 'msgstr'}
colas@0
   149
        if length $var{msgstr};
colas@0
   150
    push @metadata, parse_metadata($var{msgstr})
colas@0
   151
        if $var{msgid} eq '';
colas@0
   152
colas@0
   153
    return {@metadata, @ret};
colas@0
   154
}
colas@0
   155
colas@0
   156
sub parse_metadata {
colas@0
   157
    return map {
colas@0
   158
        (/^([^\x00-\x1f\x80-\xff :=]+):\s*(.*)$/) ?
colas@0
   159
            ($1 eq 'Content-Type') ? do {
colas@0
   160
                my $enc = $2;
colas@0
   161
                if ($enc =~ /\bcharset=\s*([-\w]+)/i) {
colas@0
   162
                    $InputEncoding = $1 || '';
colas@0
   163
                    $OutputEncoding = Locale::Maketext::Lexicon::encoding() || '';
colas@0
   164
                    $InputEncoding = 'utf8' if $InputEncoding =~ /^utf-?8$/i;
colas@0
   165
                    $OutputEncoding = 'utf8' if $OutputEncoding =~ /^utf-?8$/i;
colas@0
   166
                    if ( Locale::Maketext::Lexicon::option('decode') and
colas@0
   167
                        (!$OutputEncoding or $InputEncoding ne $OutputEncoding)) {
colas@0
   168
                        require Encode::compat if $] < 5.007001;
colas@0
   169
                        require Encode;
colas@0
   170
                        $DoEncoding = 1;
colas@0
   171
                    }
colas@0
   172
                }
colas@0
   173
                ("__Content-Type", $enc);
colas@0
   174
            } : ("__$1", $2)
colas@0
   175
        : ();
colas@0
   176
    } split(/\r*\n+\r*/, transform(pop));
colas@0
   177
}
colas@0
   178
colas@0
   179
sub transform {
colas@0
   180
    my $str = shift;
colas@0
   181
colas@0
   182
    if ($DoEncoding and $InputEncoding) {
colas@0
   183
        $str = ($InputEncoding eq 'utf8')
colas@0
   184
            ? Encode::decode_utf8($str)
colas@0
   185
            : Encode::decode($InputEncoding, $str)
colas@0
   186
    }
colas@0
   187
colas@0
   188
    $str =~ s/\\([0x]..|c?.)/qq{"\\$1"}/eeg;
colas@0
   189
colas@0
   190
    if ($DoEncoding and $OutputEncoding) {
colas@0
   191
        $str = ($OutputEncoding eq 'utf8')
colas@0
   192
            ? Encode::encode_utf8($str)
colas@0
   193
            : Encode::encode($OutputEncoding, $str)
colas@0
   194
    }
colas@0
   195
colas@0
   196
    $str =~ s/([~\[\]])/~$1/g;
colas@0
   197
    $str =~ s/(?<![%\\])%([A-Za-z#*]\w*)\(([^\)]*)\)/[$1,~~~$2~~~]/g;
colas@0
   198
    $str = join('', map {
colas@0
   199
        /^~~~.*~~~$/ ? unescape(substr($_, 3, -3)) : $_
colas@0
   200
    } split(/(~~~.*?~~~)/, $str));
colas@0
   201
    $str =~ s/(?<![%\\])%(\d+|\*)/\[_$1]/g;
colas@0
   202
colas@0
   203
    return $str;
colas@0
   204
}
colas@0
   205
colas@0
   206
sub unescape {
colas@0
   207
    join(',', map {
colas@0
   208
        /^%(?:\d+|\*)$/ ? ("_" . substr($_, 1)) : $_
colas@0
   209
    } split(/,/, $_[0]));
colas@0
   210
}
colas@0
   211
colas@0
   212
# This subroutine was derived from Locale::Maketext::Gettext::readmo()
colas@0
   213
# under the Perl License; the original author is Yi Ma Mao (IMACAT).
colas@0
   214
sub parse_mo {
colas@0
   215
    my $content = shift;
colas@0
   216
    my $tmpl = (substr($content, 0, 4) eq "\xde\x12\x04\x95") ? 'V' : 'N';
colas@0
   217
colas@0
   218
    # Check the MO format revision number
colas@0
   219
    # There is only one revision now: revision 0.
colas@0
   220
    return if unpack($tmpl, substr($content, 4, 4)) > 0;
colas@0
   221
colas@0
   222
    my ($num, $offo, $offt);
colas@0
   223
    # Number of strings
colas@0
   224
    $num = unpack $tmpl, substr($content, 8, 4);
colas@0
   225
    # Offset to the beginning of the original strings
colas@0
   226
    $offo = unpack $tmpl, substr($content, 12, 4);
colas@0
   227
    # Offset to the beginning of the translated strings
colas@0
   228
    $offt = unpack $tmpl, substr($content, 16, 4);
colas@0
   229
colas@0
   230
    my (@metadata, @ret);
colas@0
   231
    for (0 .. $num - 1) {
colas@0
   232
        my ($len, $off, $stro, $strt);
colas@0
   233
        # The first word is the length of the string
colas@0
   234
        $len = unpack $tmpl, substr($content, $offo+$_*8, 4);
colas@0
   235
        # The second word is the offset of the string
colas@0
   236
        $off = unpack $tmpl, substr($content, $offo+$_*8+4, 4);
colas@0
   237
        # Original string
colas@0
   238
        $stro = substr($content, $off, $len);
colas@0
   239
colas@0
   240
        # The first word is the length of the string
colas@0
   241
        $len = unpack $tmpl, substr($content, $offt+$_*8, 4);
colas@0
   242
        # The second word is the offset of the string
colas@0
   243
        $off = unpack $tmpl, substr($content, $offt+$_*8+4, 4);
colas@0
   244
        # Translated string
colas@0
   245
        $strt = substr($content, $off, $len);
colas@0
   246
colas@0
   247
        # Hash it
colas@0
   248
        push @metadata, parse_metadata($strt) if $stro eq '';
colas@0
   249
        push @ret, (map transform($_), $stro, $strt) if length $strt;
colas@0
   250
    }
colas@0
   251
colas@0
   252
    return {@metadata, @ret};
colas@0
   253
}
colas@0
   254
colas@0
   255
1;
colas@0
   256
colas@0
   257
=head1 SEE ALSO
colas@0
   258
colas@0
   259
L<Locale::Maketext>, L<Locale::Maketext::Lexicon>
colas@0
   260
colas@0
   261
=head1 AUTHORS
colas@0
   262
colas@0
   263
Autrijus Tang E<lt>autrijus@autrijus.orgE<gt>
colas@0
   264
colas@0
   265
=head1 COPYRIGHT
colas@0
   266
colas@0
   267
Copyright 2002, 2003, 2004 by Autrijus Tang E<lt>autrijus@autrijus.orgE<gt>.
colas@0
   268
colas@0
   269
This program is free software; you can redistribute it and/or 
colas@0
   270
modify it under the same terms as Perl itself.
colas@0
   271
colas@0
   272
See L<http://www.perl.com/perl/misc/Artistic.html>
colas@0
   273
colas@0
   274
=cut