lib/CPAN/lib/Locale/Maketext/Lexicon/Gettext.pm
changeset 0 414e01d06fd5
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/lib/CPAN/lib/Locale/Maketext/Lexicon/Gettext.pm	Sat Jan 26 15:50:53 2008 +0100
     1.3 @@ -0,0 +1,274 @@
     1.4 +package Locale::Maketext::Lexicon::Gettext;
     1.5 +$Locale::Maketext::Lexicon::Gettext::VERSION = '0.14';
     1.6 +
     1.7 +use strict;
     1.8 +
     1.9 +=head1 NAME
    1.10 +
    1.11 +Locale::Maketext::Lexicon::Gettext - PO and MO file parser for Maketext
    1.12 +
    1.13 +=head1 SYNOPSIS
    1.14 +
    1.15 +Called via B<Locale::Maketext::Lexicon>:
    1.16 +
    1.17 +    package Hello::I18N;
    1.18 +    use base 'Locale::Maketext';
    1.19 +    use Locale::Maketext::Lexicon {
    1.20 +        de => [Gettext => 'hello/de.mo'],
    1.21 +    };
    1.22 +
    1.23 +Directly calling C<parse()>:
    1.24 +
    1.25 +    use Locale::Maketext::Lexicon::Gettext;
    1.26 +    my %Lexicon = %{ Locale::Maketext::Lexicon::Gettext->parse(<DATA>) };
    1.27 +    __DATA__
    1.28 +    #: Hello.pm:10
    1.29 +    msgid "Hello, World!"
    1.30 +    msgstr "Hallo, Welt!"
    1.31 +
    1.32 +    #: Hello.pm:11
    1.33 +    msgid "You have %quant(%1,piece) of mail."
    1.34 +    msgstr "Sie haben %quant(%1,Poststueck,Poststuecken)."
    1.35 +
    1.36 +=head1 DESCRIPTION
    1.37 +
    1.38 +This module implements a perl-based C<Gettext> parser for
    1.39 +B<Locale::Maketext>. It transforms all C<%1>, C<%2>, <%*>... sequences
    1.40 +to C<[_1]>, C<[_2]>, C<[_*]>, and so on.  It accepts either plain PO
    1.41 +file, or a MO file which will be handled with a pure-perl parser
    1.42 +adapted from Imacat's C<Locale::Maketext::Gettext>.
    1.43 +
    1.44 +Since version 0.03, this module also looks for C<%I<function>(I<args...>)>
    1.45 +in the lexicon strings, and transform it to C<[I<function>,I<args...>]>.
    1.46 +Any C<%1>, C<%2>... sequences inside the I<args> will have their percent
    1.47 +signs (C<%>) replaced by underscores (C<_>).
    1.48 +
    1.49 +The name of I<function> above should begin with a letter or underscore,
    1.50 +followed by any number of alphanumeric characters and/or underscores.
    1.51 +As an exception, the function name may also consist of a single asterisk
    1.52 +(C<*>) or pound sign (C<#>), which are C<Locale::Maketext>'s shorthands
    1.53 +for C<quant> and C<numf>, respectively.
    1.54 +
    1.55 +As an additional feature, this module also parses MIME-header style
    1.56 +metadata specified in the null msgstr (C<"">), and add them to the
    1.57 +C<%Lexicon> with a C<__> prefix.  For example, the example above will
    1.58 +set C<__Content-Type> to C<text/plain; charset=iso8859-1>, without
    1.59 +the newline or the colon.
    1.60 +
    1.61 +Any normal entry that duplicates a metadata entry takes precedence.
    1.62 +Hence, a C<msgid "__Content-Type"> line occurs anywhere should override
    1.63 +the above value.
    1.64 +
    1.65 +=head1 OPTIONS
    1.66 +
    1.67 +=head2 use_fuzzy
    1.68 +
    1.69 +When parsing PO files, fuzzy entries (entries marked with C<#, fuzzy>)
    1.70 +are silently ignored.  If you wish to use fuzzy entries, specify a true
    1.71 +value to the C<_use_fuzzy> option:
    1.72 +
    1.73 +    use Locale::Maketext::Lexicon {
    1.74 +        de => [Gettext => 'hello/de.mo'],
    1.75 +        _use_fuzzy => 1,
    1.76 +    };
    1.77 +
    1.78 +=head2 allow_empty
    1.79 +
    1.80 +When parsing PO files, empty entries (entries with C<msgstr "">) are
    1.81 +silently ignored.  If you wish to allow empty entries, specify a true
    1.82 +value to the C<_allow_empty> option:
    1.83 +
    1.84 +    use Locale::Maketext::Lexicon {
    1.85 +        de => [Gettext => 'hello/de.mo'],
    1.86 +        _allow_empty => 1,
    1.87 +    };
    1.88 +
    1.89 +=cut
    1.90 +
    1.91 +my ($InputEncoding, $OutputEncoding, $DoEncoding);
    1.92 +
    1.93 +sub input_encoding { $InputEncoding };
    1.94 +sub output_encoding { $OutputEncoding };
    1.95 +
    1.96 +sub parse {
    1.97 +    my $self = shift;
    1.98 +    my (%var, $key, @ret);
    1.99 +    my @metadata;
   1.100 +
   1.101 +    $InputEncoding = $OutputEncoding = $DoEncoding = undef;
   1.102 +
   1.103 +    use Carp;
   1.104 +    Carp::cluck "Undefined source called\n" unless defined $_[0];
   1.105 +
   1.106 +    # Check for magic string of MO files
   1.107 +    return parse_mo(join('', @_))
   1.108 +        if ($_[0] =~ /^\x95\x04\x12\xde/ or $_[0] =~ /^\xde\x12\x04\x95/);
   1.109 +
   1.110 +    local $^W;  # no 'uninitialized' warnings, please.
   1.111 +
   1.112 +    require Locale::Maketext::Lexicon;
   1.113 +    my $UseFuzzy = Locale::Maketext::Lexicon::option('use_fuzzy');
   1.114 +    my $AllowEmpty = Locale::Maketext::Lexicon::option('allow_empty');
   1.115 +    my $process = sub {
   1.116 +            if ( length($var{msgstr}) and ($UseFuzzy or !$var{fuzzy}) ) {
   1.117 +                push @ret, (map transform($_), @var{'msgid', 'msgstr'});
   1.118 +            }
   1.119 +            elsif ( $AllowEmpty ) {
   1.120 +                push @ret, (transform($var{msgid}), '');
   1.121 +            }
   1.122 +            push @metadata, parse_metadata($var{msgstr})
   1.123 +                if $var{msgid} eq '';
   1.124 +            %var = ();
   1.125 +    };
   1.126 +
   1.127 +    # Parse PO files
   1.128 +    foreach (@_) {
   1.129 +        s/[\015\012]*\z//; # fix CRLF issues
   1.130 +
   1.131 +        /^(msgid|msgstr) +"(.*)" *$/    ? do {  # leading strings
   1.132 +            $var{$1} = $2;
   1.133 +            $key = $1;
   1.134 +        } :
   1.135 +
   1.136 +        /^"(.*)" *$/                    ? do {  # continued strings
   1.137 +            $var{$key} .= $1;
   1.138 +        } :
   1.139 +
   1.140 +        /^#, +(.*) *$/                  ? do {  # control variables
   1.141 +            $var{$_} = 1 for split(/,\s+/, $1);
   1.142 +        } :
   1.143 +
   1.144 +        /^ *$/ && %var                  ? do {  # interpolate string escapes
   1.145 +		$process->($_);
   1.146 +        } : ();
   1.147 +    }
   1.148 +    # do not silently skip last entry
   1.149 +    $process->() if keys %var != 0;
   1.150 +
   1.151 +    push @ret, map { transform($_) } @var{'msgid', 'msgstr'}
   1.152 +        if length $var{msgstr};
   1.153 +    push @metadata, parse_metadata($var{msgstr})
   1.154 +        if $var{msgid} eq '';
   1.155 +
   1.156 +    return {@metadata, @ret};
   1.157 +}
   1.158 +
   1.159 +sub parse_metadata {
   1.160 +    return map {
   1.161 +        (/^([^\x00-\x1f\x80-\xff :=]+):\s*(.*)$/) ?
   1.162 +            ($1 eq 'Content-Type') ? do {
   1.163 +                my $enc = $2;
   1.164 +                if ($enc =~ /\bcharset=\s*([-\w]+)/i) {
   1.165 +                    $InputEncoding = $1 || '';
   1.166 +                    $OutputEncoding = Locale::Maketext::Lexicon::encoding() || '';
   1.167 +                    $InputEncoding = 'utf8' if $InputEncoding =~ /^utf-?8$/i;
   1.168 +                    $OutputEncoding = 'utf8' if $OutputEncoding =~ /^utf-?8$/i;
   1.169 +                    if ( Locale::Maketext::Lexicon::option('decode') and
   1.170 +                        (!$OutputEncoding or $InputEncoding ne $OutputEncoding)) {
   1.171 +                        require Encode::compat if $] < 5.007001;
   1.172 +                        require Encode;
   1.173 +                        $DoEncoding = 1;
   1.174 +                    }
   1.175 +                }
   1.176 +                ("__Content-Type", $enc);
   1.177 +            } : ("__$1", $2)
   1.178 +        : ();
   1.179 +    } split(/\r*\n+\r*/, transform(pop));
   1.180 +}
   1.181 +
   1.182 +sub transform {
   1.183 +    my $str = shift;
   1.184 +
   1.185 +    if ($DoEncoding and $InputEncoding) {
   1.186 +        $str = ($InputEncoding eq 'utf8')
   1.187 +            ? Encode::decode_utf8($str)
   1.188 +            : Encode::decode($InputEncoding, $str)
   1.189 +    }
   1.190 +
   1.191 +    $str =~ s/\\([0x]..|c?.)/qq{"\\$1"}/eeg;
   1.192 +
   1.193 +    if ($DoEncoding and $OutputEncoding) {
   1.194 +        $str = ($OutputEncoding eq 'utf8')
   1.195 +            ? Encode::encode_utf8($str)
   1.196 +            : Encode::encode($OutputEncoding, $str)
   1.197 +    }
   1.198 +
   1.199 +    $str =~ s/([~\[\]])/~$1/g;
   1.200 +    $str =~ s/(?<![%\\])%([A-Za-z#*]\w*)\(([^\)]*)\)/[$1,~~~$2~~~]/g;
   1.201 +    $str = join('', map {
   1.202 +        /^~~~.*~~~$/ ? unescape(substr($_, 3, -3)) : $_
   1.203 +    } split(/(~~~.*?~~~)/, $str));
   1.204 +    $str =~ s/(?<![%\\])%(\d+|\*)/\[_$1]/g;
   1.205 +
   1.206 +    return $str;
   1.207 +}
   1.208 +
   1.209 +sub unescape {
   1.210 +    join(',', map {
   1.211 +        /^%(?:\d+|\*)$/ ? ("_" . substr($_, 1)) : $_
   1.212 +    } split(/,/, $_[0]));
   1.213 +}
   1.214 +
   1.215 +# This subroutine was derived from Locale::Maketext::Gettext::readmo()
   1.216 +# under the Perl License; the original author is Yi Ma Mao (IMACAT).
   1.217 +sub parse_mo {
   1.218 +    my $content = shift;
   1.219 +    my $tmpl = (substr($content, 0, 4) eq "\xde\x12\x04\x95") ? 'V' : 'N';
   1.220 +
   1.221 +    # Check the MO format revision number
   1.222 +    # There is only one revision now: revision 0.
   1.223 +    return if unpack($tmpl, substr($content, 4, 4)) > 0;
   1.224 +
   1.225 +    my ($num, $offo, $offt);
   1.226 +    # Number of strings
   1.227 +    $num = unpack $tmpl, substr($content, 8, 4);
   1.228 +    # Offset to the beginning of the original strings
   1.229 +    $offo = unpack $tmpl, substr($content, 12, 4);
   1.230 +    # Offset to the beginning of the translated strings
   1.231 +    $offt = unpack $tmpl, substr($content, 16, 4);
   1.232 +
   1.233 +    my (@metadata, @ret);
   1.234 +    for (0 .. $num - 1) {
   1.235 +        my ($len, $off, $stro, $strt);
   1.236 +        # The first word is the length of the string
   1.237 +        $len = unpack $tmpl, substr($content, $offo+$_*8, 4);
   1.238 +        # The second word is the offset of the string
   1.239 +        $off = unpack $tmpl, substr($content, $offo+$_*8+4, 4);
   1.240 +        # Original string
   1.241 +        $stro = substr($content, $off, $len);
   1.242 +
   1.243 +        # The first word is the length of the string
   1.244 +        $len = unpack $tmpl, substr($content, $offt+$_*8, 4);
   1.245 +        # The second word is the offset of the string
   1.246 +        $off = unpack $tmpl, substr($content, $offt+$_*8+4, 4);
   1.247 +        # Translated string
   1.248 +        $strt = substr($content, $off, $len);
   1.249 +
   1.250 +        # Hash it
   1.251 +        push @metadata, parse_metadata($strt) if $stro eq '';
   1.252 +        push @ret, (map transform($_), $stro, $strt) if length $strt;
   1.253 +    }
   1.254 +
   1.255 +    return {@metadata, @ret};
   1.256 +}
   1.257 +
   1.258 +1;
   1.259 +
   1.260 +=head1 SEE ALSO
   1.261 +
   1.262 +L<Locale::Maketext>, L<Locale::Maketext::Lexicon>
   1.263 +
   1.264 +=head1 AUTHORS
   1.265 +
   1.266 +Autrijus Tang E<lt>autrijus@autrijus.orgE<gt>
   1.267 +
   1.268 +=head1 COPYRIGHT
   1.269 +
   1.270 +Copyright 2002, 2003, 2004 by Autrijus Tang E<lt>autrijus@autrijus.orgE<gt>.
   1.271 +
   1.272 +This program is free software; you can redistribute it and/or 
   1.273 +modify it under the same terms as Perl itself.
   1.274 +
   1.275 +See L<http://www.perl.com/perl/misc/Artistic.html>
   1.276 +
   1.277 +=cut