lib/CPAN/lib/Locale/Maketext/Lexicon/Gettext.pm
changeset 0 414e01d06fd5
equal deleted inserted replaced
-1:000000000000 0:414e01d06fd5
       
     1 package Locale::Maketext::Lexicon::Gettext;
       
     2 $Locale::Maketext::Lexicon::Gettext::VERSION = '0.14';
       
     3 
       
     4 use strict;
       
     5 
       
     6 =head1 NAME
       
     7 
       
     8 Locale::Maketext::Lexicon::Gettext - PO and MO file parser for Maketext
       
     9 
       
    10 =head1 SYNOPSIS
       
    11 
       
    12 Called via B<Locale::Maketext::Lexicon>:
       
    13 
       
    14     package Hello::I18N;
       
    15     use base 'Locale::Maketext';
       
    16     use Locale::Maketext::Lexicon {
       
    17         de => [Gettext => 'hello/de.mo'],
       
    18     };
       
    19 
       
    20 Directly calling C<parse()>:
       
    21 
       
    22     use Locale::Maketext::Lexicon::Gettext;
       
    23     my %Lexicon = %{ Locale::Maketext::Lexicon::Gettext->parse(<DATA>) };
       
    24     __DATA__
       
    25     #: Hello.pm:10
       
    26     msgid "Hello, World!"
       
    27     msgstr "Hallo, Welt!"
       
    28 
       
    29     #: Hello.pm:11
       
    30     msgid "You have %quant(%1,piece) of mail."
       
    31     msgstr "Sie haben %quant(%1,Poststueck,Poststuecken)."
       
    32 
       
    33 =head1 DESCRIPTION
       
    34 
       
    35 This module implements a perl-based C<Gettext> parser for
       
    36 B<Locale::Maketext>. It transforms all C<%1>, C<%2>, <%*>... sequences
       
    37 to C<[_1]>, C<[_2]>, C<[_*]>, and so on.  It accepts either plain PO
       
    38 file, or a MO file which will be handled with a pure-perl parser
       
    39 adapted from Imacat's C<Locale::Maketext::Gettext>.
       
    40 
       
    41 Since version 0.03, this module also looks for C<%I<function>(I<args...>)>
       
    42 in the lexicon strings, and transform it to C<[I<function>,I<args...>]>.
       
    43 Any C<%1>, C<%2>... sequences inside the I<args> will have their percent
       
    44 signs (C<%>) replaced by underscores (C<_>).
       
    45 
       
    46 The name of I<function> above should begin with a letter or underscore,
       
    47 followed by any number of alphanumeric characters and/or underscores.
       
    48 As an exception, the function name may also consist of a single asterisk
       
    49 (C<*>) or pound sign (C<#>), which are C<Locale::Maketext>'s shorthands
       
    50 for C<quant> and C<numf>, respectively.
       
    51 
       
    52 As an additional feature, this module also parses MIME-header style
       
    53 metadata specified in the null msgstr (C<"">), and add them to the
       
    54 C<%Lexicon> with a C<__> prefix.  For example, the example above will
       
    55 set C<__Content-Type> to C<text/plain; charset=iso8859-1>, without
       
    56 the newline or the colon.
       
    57 
       
    58 Any normal entry that duplicates a metadata entry takes precedence.
       
    59 Hence, a C<msgid "__Content-Type"> line occurs anywhere should override
       
    60 the above value.
       
    61 
       
    62 =head1 OPTIONS
       
    63 
       
    64 =head2 use_fuzzy
       
    65 
       
    66 When parsing PO files, fuzzy entries (entries marked with C<#, fuzzy>)
       
    67 are silently ignored.  If you wish to use fuzzy entries, specify a true
       
    68 value to the C<_use_fuzzy> option:
       
    69 
       
    70     use Locale::Maketext::Lexicon {
       
    71         de => [Gettext => 'hello/de.mo'],
       
    72         _use_fuzzy => 1,
       
    73     };
       
    74 
       
    75 =head2 allow_empty
       
    76 
       
    77 When parsing PO files, empty entries (entries with C<msgstr "">) are
       
    78 silently ignored.  If you wish to allow empty entries, specify a true
       
    79 value to the C<_allow_empty> option:
       
    80 
       
    81     use Locale::Maketext::Lexicon {
       
    82         de => [Gettext => 'hello/de.mo'],
       
    83         _allow_empty => 1,
       
    84     };
       
    85 
       
    86 =cut
       
    87 
       
    88 my ($InputEncoding, $OutputEncoding, $DoEncoding);
       
    89 
       
    90 sub input_encoding { $InputEncoding };
       
    91 sub output_encoding { $OutputEncoding };
       
    92 
       
    93 sub parse {
       
    94     my $self = shift;
       
    95     my (%var, $key, @ret);
       
    96     my @metadata;
       
    97 
       
    98     $InputEncoding = $OutputEncoding = $DoEncoding = undef;
       
    99 
       
   100     use Carp;
       
   101     Carp::cluck "Undefined source called\n" unless defined $_[0];
       
   102 
       
   103     # Check for magic string of MO files
       
   104     return parse_mo(join('', @_))
       
   105         if ($_[0] =~ /^\x95\x04\x12\xde/ or $_[0] =~ /^\xde\x12\x04\x95/);
       
   106 
       
   107     local $^W;  # no 'uninitialized' warnings, please.
       
   108 
       
   109     require Locale::Maketext::Lexicon;
       
   110     my $UseFuzzy = Locale::Maketext::Lexicon::option('use_fuzzy');
       
   111     my $AllowEmpty = Locale::Maketext::Lexicon::option('allow_empty');
       
   112     my $process = sub {
       
   113             if ( length($var{msgstr}) and ($UseFuzzy or !$var{fuzzy}) ) {
       
   114                 push @ret, (map transform($_), @var{'msgid', 'msgstr'});
       
   115             }
       
   116             elsif ( $AllowEmpty ) {
       
   117                 push @ret, (transform($var{msgid}), '');
       
   118             }
       
   119             push @metadata, parse_metadata($var{msgstr})
       
   120                 if $var{msgid} eq '';
       
   121             %var = ();
       
   122     };
       
   123 
       
   124     # Parse PO files
       
   125     foreach (@_) {
       
   126         s/[\015\012]*\z//; # fix CRLF issues
       
   127 
       
   128         /^(msgid|msgstr) +"(.*)" *$/    ? do {  # leading strings
       
   129             $var{$1} = $2;
       
   130             $key = $1;
       
   131         } :
       
   132 
       
   133         /^"(.*)" *$/                    ? do {  # continued strings
       
   134             $var{$key} .= $1;
       
   135         } :
       
   136 
       
   137         /^#, +(.*) *$/                  ? do {  # control variables
       
   138             $var{$_} = 1 for split(/,\s+/, $1);
       
   139         } :
       
   140 
       
   141         /^ *$/ && %var                  ? do {  # interpolate string escapes
       
   142 		$process->($_);
       
   143         } : ();
       
   144     }
       
   145     # do not silently skip last entry
       
   146     $process->() if keys %var != 0;
       
   147 
       
   148     push @ret, map { transform($_) } @var{'msgid', 'msgstr'}
       
   149         if length $var{msgstr};
       
   150     push @metadata, parse_metadata($var{msgstr})
       
   151         if $var{msgid} eq '';
       
   152 
       
   153     return {@metadata, @ret};
       
   154 }
       
   155 
       
   156 sub parse_metadata {
       
   157     return map {
       
   158         (/^([^\x00-\x1f\x80-\xff :=]+):\s*(.*)$/) ?
       
   159             ($1 eq 'Content-Type') ? do {
       
   160                 my $enc = $2;
       
   161                 if ($enc =~ /\bcharset=\s*([-\w]+)/i) {
       
   162                     $InputEncoding = $1 || '';
       
   163                     $OutputEncoding = Locale::Maketext::Lexicon::encoding() || '';
       
   164                     $InputEncoding = 'utf8' if $InputEncoding =~ /^utf-?8$/i;
       
   165                     $OutputEncoding = 'utf8' if $OutputEncoding =~ /^utf-?8$/i;
       
   166                     if ( Locale::Maketext::Lexicon::option('decode') and
       
   167                         (!$OutputEncoding or $InputEncoding ne $OutputEncoding)) {
       
   168                         require Encode::compat if $] < 5.007001;
       
   169                         require Encode;
       
   170                         $DoEncoding = 1;
       
   171                     }
       
   172                 }
       
   173                 ("__Content-Type", $enc);
       
   174             } : ("__$1", $2)
       
   175         : ();
       
   176     } split(/\r*\n+\r*/, transform(pop));
       
   177 }
       
   178 
       
   179 sub transform {
       
   180     my $str = shift;
       
   181 
       
   182     if ($DoEncoding and $InputEncoding) {
       
   183         $str = ($InputEncoding eq 'utf8')
       
   184             ? Encode::decode_utf8($str)
       
   185             : Encode::decode($InputEncoding, $str)
       
   186     }
       
   187 
       
   188     $str =~ s/\\([0x]..|c?.)/qq{"\\$1"}/eeg;
       
   189 
       
   190     if ($DoEncoding and $OutputEncoding) {
       
   191         $str = ($OutputEncoding eq 'utf8')
       
   192             ? Encode::encode_utf8($str)
       
   193             : Encode::encode($OutputEncoding, $str)
       
   194     }
       
   195 
       
   196     $str =~ s/([~\[\]])/~$1/g;
       
   197     $str =~ s/(?<![%\\])%([A-Za-z#*]\w*)\(([^\)]*)\)/[$1,~~~$2~~~]/g;
       
   198     $str = join('', map {
       
   199         /^~~~.*~~~$/ ? unescape(substr($_, 3, -3)) : $_
       
   200     } split(/(~~~.*?~~~)/, $str));
       
   201     $str =~ s/(?<![%\\])%(\d+|\*)/\[_$1]/g;
       
   202 
       
   203     return $str;
       
   204 }
       
   205 
       
   206 sub unescape {
       
   207     join(',', map {
       
   208         /^%(?:\d+|\*)$/ ? ("_" . substr($_, 1)) : $_
       
   209     } split(/,/, $_[0]));
       
   210 }
       
   211 
       
   212 # This subroutine was derived from Locale::Maketext::Gettext::readmo()
       
   213 # under the Perl License; the original author is Yi Ma Mao (IMACAT).
       
   214 sub parse_mo {
       
   215     my $content = shift;
       
   216     my $tmpl = (substr($content, 0, 4) eq "\xde\x12\x04\x95") ? 'V' : 'N';
       
   217 
       
   218     # Check the MO format revision number
       
   219     # There is only one revision now: revision 0.
       
   220     return if unpack($tmpl, substr($content, 4, 4)) > 0;
       
   221 
       
   222     my ($num, $offo, $offt);
       
   223     # Number of strings
       
   224     $num = unpack $tmpl, substr($content, 8, 4);
       
   225     # Offset to the beginning of the original strings
       
   226     $offo = unpack $tmpl, substr($content, 12, 4);
       
   227     # Offset to the beginning of the translated strings
       
   228     $offt = unpack $tmpl, substr($content, 16, 4);
       
   229 
       
   230     my (@metadata, @ret);
       
   231     for (0 .. $num - 1) {
       
   232         my ($len, $off, $stro, $strt);
       
   233         # The first word is the length of the string
       
   234         $len = unpack $tmpl, substr($content, $offo+$_*8, 4);
       
   235         # The second word is the offset of the string
       
   236         $off = unpack $tmpl, substr($content, $offo+$_*8+4, 4);
       
   237         # Original string
       
   238         $stro = substr($content, $off, $len);
       
   239 
       
   240         # The first word is the length of the string
       
   241         $len = unpack $tmpl, substr($content, $offt+$_*8, 4);
       
   242         # The second word is the offset of the string
       
   243         $off = unpack $tmpl, substr($content, $offt+$_*8+4, 4);
       
   244         # Translated string
       
   245         $strt = substr($content, $off, $len);
       
   246 
       
   247         # Hash it
       
   248         push @metadata, parse_metadata($strt) if $stro eq '';
       
   249         push @ret, (map transform($_), $stro, $strt) if length $strt;
       
   250     }
       
   251 
       
   252     return {@metadata, @ret};
       
   253 }
       
   254 
       
   255 1;
       
   256 
       
   257 =head1 SEE ALSO
       
   258 
       
   259 L<Locale::Maketext>, L<Locale::Maketext::Lexicon>
       
   260 
       
   261 =head1 AUTHORS
       
   262 
       
   263 Autrijus Tang E<lt>autrijus@autrijus.orgE<gt>
       
   264 
       
   265 =head1 COPYRIGHT
       
   266 
       
   267 Copyright 2002, 2003, 2004 by Autrijus Tang E<lt>autrijus@autrijus.orgE<gt>.
       
   268 
       
   269 This program is free software; you can redistribute it and/or 
       
   270 modify it under the same terms as Perl itself.
       
   271 
       
   272 See L<http://www.perl.com/perl/misc/Artistic.html>
       
   273 
       
   274 =cut