lib/CPAN/lib/Locale/Maketext/Lexicon/Gettext.pm
author Colas Nahaboo <colas@nahaboo.net>
Sat, 26 Jan 2008 15:50:53 +0100
changeset 0 414e01d06fd5
permissions -rw-r--r--
RELEASE 4.2.0 freetown
     1 package Locale::Maketext::Lexicon::Gettext;
     2 $Locale::Maketext::Lexicon::Gettext::VERSION = '0.14';
     3 
     4 use strict;
     5 
     6 =head1 NAME
     7 
     8 Locale::Maketext::Lexicon::Gettext - PO and MO file parser for Maketext
     9 
    10 =head1 SYNOPSIS
    11 
    12 Called via B<Locale::Maketext::Lexicon>:
    13 
    14     package Hello::I18N;
    15     use base 'Locale::Maketext';
    16     use Locale::Maketext::Lexicon {
    17         de => [Gettext => 'hello/de.mo'],
    18     };
    19 
    20 Directly calling C<parse()>:
    21 
    22     use Locale::Maketext::Lexicon::Gettext;
    23     my %Lexicon = %{ Locale::Maketext::Lexicon::Gettext->parse(<DATA>) };
    24     __DATA__
    25     #: Hello.pm:10
    26     msgid "Hello, World!"
    27     msgstr "Hallo, Welt!"
    28 
    29     #: Hello.pm:11
    30     msgid "You have %quant(%1,piece) of mail."
    31     msgstr "Sie haben %quant(%1,Poststueck,Poststuecken)."
    32 
    33 =head1 DESCRIPTION
    34 
This module implements a perl-based C<Gettext> parser for
B<Locale::Maketext>. It transforms all C<%1>, C<%2>, C<%*>... sequences
to C<[_1]>, C<[_2]>, C<[_*]>, and so on.  It accepts either a plain PO
file, or a MO file which will be handled with a pure-perl parser
adapted from Imacat's C<Locale::Maketext::Gettext>.
    40 
Since version 0.03, this module also looks for C<%I<function>(I<args...>)>
in the lexicon strings, and transforms it to C<[I<function>,I<args...>]>.
Any C<%1>, C<%2>... sequences inside the I<args> will have their percent
signs (C<%>) replaced by underscores (C<_>).
    45 
    46 The name of I<function> above should begin with a letter or underscore,
    47 followed by any number of alphanumeric characters and/or underscores.
    48 As an exception, the function name may also consist of a single asterisk
    49 (C<*>) or pound sign (C<#>), which are C<Locale::Maketext>'s shorthands
    50 for C<quant> and C<numf>, respectively.
    51 
As an additional feature, this module also parses MIME-header style
metadata specified in the msgstr of the empty msgid (C<msgid "">), and
adds them to the C<%Lexicon> with a C<__> prefix.  For example, a header
line of C<Content-Type: text/plain; charset=iso8859-1> sets
C<__Content-Type> to C<text/plain; charset=iso8859-1>, without
the newline or the colon.
    57 
Any normal entry that duplicates a metadata entry takes precedence.
Hence, a C<msgid "__Content-Type"> entry occurring anywhere overrides
the above value.
    61 
    62 =head1 OPTIONS
    63 
    64 =head2 use_fuzzy
    65 
    66 When parsing PO files, fuzzy entries (entries marked with C<#, fuzzy>)
    67 are silently ignored.  If you wish to use fuzzy entries, specify a true
    68 value to the C<_use_fuzzy> option:
    69 
    70     use Locale::Maketext::Lexicon {
    71         de => [Gettext => 'hello/de.mo'],
    72         _use_fuzzy => 1,
    73     };
    74 
    75 =head2 allow_empty
    76 
    77 When parsing PO files, empty entries (entries with C<msgstr "">) are
    78 silently ignored.  If you wish to allow empty entries, specify a true
    79 value to the C<_allow_empty> option:
    80 
    81     use Locale::Maketext::Lexicon {
    82         de => [Gettext => 'hello/de.mo'],
    83         _allow_empty => 1,
    84     };
    85 
    86 =cut
    87 
    88 my ($InputEncoding, $OutputEncoding, $DoEncoding);
    89 
    90 sub input_encoding { $InputEncoding };
    91 sub output_encoding { $OutputEncoding };
    92 
    93 sub parse {
    94     my $self = shift;
    95     my (%var, $key, @ret);
    96     my @metadata;
    97 
    98     $InputEncoding = $OutputEncoding = $DoEncoding = undef;
    99 
   100     use Carp;
   101     Carp::cluck "Undefined source called\n" unless defined $_[0];
   102 
   103     # Check for magic string of MO files
   104     return parse_mo(join('', @_))
   105         if ($_[0] =~ /^\x95\x04\x12\xde/ or $_[0] =~ /^\xde\x12\x04\x95/);
   106 
   107     local $^W;  # no 'uninitialized' warnings, please.
   108 
   109     require Locale::Maketext::Lexicon;
   110     my $UseFuzzy = Locale::Maketext::Lexicon::option('use_fuzzy');
   111     my $AllowEmpty = Locale::Maketext::Lexicon::option('allow_empty');
   112     my $process = sub {
   113             if ( length($var{msgstr}) and ($UseFuzzy or !$var{fuzzy}) ) {
   114                 push @ret, (map transform($_), @var{'msgid', 'msgstr'});
   115             }
   116             elsif ( $AllowEmpty ) {
   117                 push @ret, (transform($var{msgid}), '');
   118             }
   119             push @metadata, parse_metadata($var{msgstr})
   120                 if $var{msgid} eq '';
   121             %var = ();
   122     };
   123 
   124     # Parse PO files
   125     foreach (@_) {
   126         s/[\015\012]*\z//; # fix CRLF issues
   127 
   128         /^(msgid|msgstr) +"(.*)" *$/    ? do {  # leading strings
   129             $var{$1} = $2;
   130             $key = $1;
   131         } :
   132 
   133         /^"(.*)" *$/                    ? do {  # continued strings
   134             $var{$key} .= $1;
   135         } :
   136 
   137         /^#, +(.*) *$/                  ? do {  # control variables
   138             $var{$_} = 1 for split(/,\s+/, $1);
   139         } :
   140 
   141         /^ *$/ && %var                  ? do {  # interpolate string escapes
   142 		$process->($_);
   143         } : ();
   144     }
   145     # do not silently skip last entry
   146     $process->() if keys %var != 0;
   147 
   148     push @ret, map { transform($_) } @var{'msgid', 'msgstr'}
   149         if length $var{msgstr};
   150     push @metadata, parse_metadata($var{msgstr})
   151         if $var{msgid} eq '';
   152 
   153     return {@metadata, @ret};
   154 }
   155 
   156 sub parse_metadata {
   157     return map {
   158         (/^([^\x00-\x1f\x80-\xff :=]+):\s*(.*)$/) ?
   159             ($1 eq 'Content-Type') ? do {
   160                 my $enc = $2;
   161                 if ($enc =~ /\bcharset=\s*([-\w]+)/i) {
   162                     $InputEncoding = $1 || '';
   163                     $OutputEncoding = Locale::Maketext::Lexicon::encoding() || '';
   164                     $InputEncoding = 'utf8' if $InputEncoding =~ /^utf-?8$/i;
   165                     $OutputEncoding = 'utf8' if $OutputEncoding =~ /^utf-?8$/i;
   166                     if ( Locale::Maketext::Lexicon::option('decode') and
   167                         (!$OutputEncoding or $InputEncoding ne $OutputEncoding)) {
   168                         require Encode::compat if $] < 5.007001;
   169                         require Encode;
   170                         $DoEncoding = 1;
   171                     }
   172                 }
   173                 ("__Content-Type", $enc);
   174             } : ("__$1", $2)
   175         : ();
   176     } split(/\r*\n+\r*/, transform(pop));
   177 }
   178 
   179 sub transform {
   180     my $str = shift;
   181 
   182     if ($DoEncoding and $InputEncoding) {
   183         $str = ($InputEncoding eq 'utf8')
   184             ? Encode::decode_utf8($str)
   185             : Encode::decode($InputEncoding, $str)
   186     }
   187 
   188     $str =~ s/\\([0x]..|c?.)/qq{"\\$1"}/eeg;
   189 
   190     if ($DoEncoding and $OutputEncoding) {
   191         $str = ($OutputEncoding eq 'utf8')
   192             ? Encode::encode_utf8($str)
   193             : Encode::encode($OutputEncoding, $str)
   194     }
   195 
   196     $str =~ s/([~\[\]])/~$1/g;
   197     $str =~ s/(?<![%\\])%([A-Za-z#*]\w*)\(([^\)]*)\)/[$1,~~~$2~~~]/g;
   198     $str = join('', map {
   199         /^~~~.*~~~$/ ? unescape(substr($_, 3, -3)) : $_
   200     } split(/(~~~.*?~~~)/, $str));
   201     $str =~ s/(?<![%\\])%(\d+|\*)/\[_$1]/g;
   202 
   203     return $str;
   204 }
   205 
   206 sub unescape {
   207     join(',', map {
   208         /^%(?:\d+|\*)$/ ? ("_" . substr($_, 1)) : $_
   209     } split(/,/, $_[0]));
   210 }
   211 
   212 # This subroutine was derived from Locale::Maketext::Gettext::readmo()
   213 # under the Perl License; the original author is Yi Ma Mao (IMACAT).
   214 sub parse_mo {
   215     my $content = shift;
   216     my $tmpl = (substr($content, 0, 4) eq "\xde\x12\x04\x95") ? 'V' : 'N';
   217 
   218     # Check the MO format revision number
   219     # There is only one revision now: revision 0.
   220     return if unpack($tmpl, substr($content, 4, 4)) > 0;
   221 
   222     my ($num, $offo, $offt);
   223     # Number of strings
   224     $num = unpack $tmpl, substr($content, 8, 4);
   225     # Offset to the beginning of the original strings
   226     $offo = unpack $tmpl, substr($content, 12, 4);
   227     # Offset to the beginning of the translated strings
   228     $offt = unpack $tmpl, substr($content, 16, 4);
   229 
   230     my (@metadata, @ret);
   231     for (0 .. $num - 1) {
   232         my ($len, $off, $stro, $strt);
   233         # The first word is the length of the string
   234         $len = unpack $tmpl, substr($content, $offo+$_*8, 4);
   235         # The second word is the offset of the string
   236         $off = unpack $tmpl, substr($content, $offo+$_*8+4, 4);
   237         # Original string
   238         $stro = substr($content, $off, $len);
   239 
   240         # The first word is the length of the string
   241         $len = unpack $tmpl, substr($content, $offt+$_*8, 4);
   242         # The second word is the offset of the string
   243         $off = unpack $tmpl, substr($content, $offt+$_*8+4, 4);
   244         # Translated string
   245         $strt = substr($content, $off, $len);
   246 
   247         # Hash it
   248         push @metadata, parse_metadata($strt) if $stro eq '';
   249         push @ret, (map transform($_), $stro, $strt) if length $strt;
   250     }
   251 
   252     return {@metadata, @ret};
   253 }
   254 
   255 1;
   256 
   257 =head1 SEE ALSO
   258 
   259 L<Locale::Maketext>, L<Locale::Maketext::Lexicon>
   260 
   261 =head1 AUTHORS
   262 
   263 Autrijus Tang E<lt>autrijus@autrijus.orgE<gt>
   264 
   265 =head1 COPYRIGHT
   266 
   267 Copyright 2002, 2003, 2004 by Autrijus Tang E<lt>autrijus@autrijus.orgE<gt>.
   268 
   269 This program is free software; you can redistribute it and/or 
   270 modify it under the same terms as Perl itself.
   271 
   272 See L<http://www.perl.com/perl/misc/Artistic.html>
   273 
   274 =cut