lib/CPAN/lib/Algorithm/Diff.pm
author Colas Nahaboo <colas@nahaboo.net>
Sat, 26 Jan 2008 15:50:53 +0100
changeset 0 414e01d06fd5
permissions -rw-r--r--
RELEASE 4.2.0 freetown
colas@0
     1
package Algorithm::Diff;
colas@0
     2
# Skip to first "=head" line for documentation.
colas@0
     3
use strict;
colas@0
     4
colas@0
     5
use integer;    # see below in _replaceNextLargerWith() for mod to make
colas@0
     6
                # if you don't use this
colas@0
     7
use vars qw( $VERSION @EXPORT_OK );
colas@0
     8
$VERSION = 1.19_01;
colas@0
     9
#          ^ ^^ ^^-- Incremented at will
colas@0
    10
#          | \+----- Incremented for non-trivial changes to features
colas@0
    11
#          \-------- Incremented for fundamental changes
colas@0
    12
require Exporter;
colas@0
    13
*import    = \&Exporter::import;
colas@0
    14
@EXPORT_OK = qw(
colas@0
    15
    prepare LCS LCDidx LCS_length
colas@0
    16
    diff sdiff compact_diff
colas@0
    17
    traverse_sequences traverse_balanced
colas@0
    18
);
colas@0
    19
colas@0
    20
# McIlroy-Hunt diff algorithm
colas@0
    21
# Adapted from the Smalltalk code of Mario I. Wolczko, <mario@wolczko.com>
colas@0
    22
# by Ned Konz, perl@bike-nomad.com
colas@0
    23
# Updates by Tye McQueen, http://perlmonks.org/?node=tye
colas@0
    24
colas@0
    25
# Create a hash that maps each element of $aCollection to the set of
colas@0
    26
# positions it occupies in $aCollection, restricted to the elements
colas@0
    27
# within the range of indexes specified by $start and $end.
colas@0
    28
# The fourth parameter is a subroutine reference that will be called to
colas@0
    29
# generate a string to use as a key.
colas@0
    30
# Additional parameters, if any, will be passed to this subroutine.
colas@0
    31
#
colas@0
    32
# my $hashRef = _withPositionsOfInInterval( \@array, $start, $end, $keyGen );
colas@0
    33
colas@0
    34
sub _withPositionsOfInInterval
colas@0
    35
{
colas@0
    36
    my $aCollection = shift;    # array ref
colas@0
    37
    my $start       = shift;
colas@0
    38
    my $end         = shift;
colas@0
    39
    my $keyGen      = shift;
colas@0
    40
    my %d;
colas@0
    41
    my $index;
colas@0
    42
    for ( $index = $start ; $index <= $end ; $index++ )
colas@0
    43
    {
colas@0
    44
        my $element = $aCollection->[$index];
colas@0
    45
        my $key = &$keyGen( $element, @_ );
colas@0
    46
        if ( exists( $d{$key} ) )
colas@0
    47
        {
colas@0
    48
            unshift ( @{ $d{$key} }, $index );
colas@0
    49
        }
colas@0
    50
        else
colas@0
    51
        {
colas@0
    52
            $d{$key} = [$index];
colas@0
    53
        }
colas@0
    54
    }
colas@0
    55
    return wantarray ? %d : \%d;
colas@0
    56
}
colas@0
    57
colas@0
    58
# Find the place at which aValue would normally be inserted into the
colas@0
    59
# array. If that place is already occupied by aValue, do nothing, and
colas@0
    60
# return undef. If the place does not exist (i.e., it is off the end of
colas@0
    61
# the array), add it to the end, otherwise replace the element at that
colas@0
    62
# point with aValue.  It is assumed that the array's values are numeric.
colas@0
    63
# This is where the bulk (75%) of the time is spent in this module, so
colas@0
    64
# try to make it fast!
colas@0
    65
colas@0
    66
sub _replaceNextLargerWith
colas@0
    67
{
colas@0
    68
    my ( $array, $aValue, $high ) = @_;
colas@0
    69
    $high ||= $#$array;
colas@0
    70
colas@0
    71
    # off the end?
colas@0
    72
    if ( $high == -1 || $aValue > $array->[-1] )
colas@0
    73
    {
colas@0
    74
        push ( @$array, $aValue );
colas@0
    75
        return $high + 1;
colas@0
    76
    }
colas@0
    77
colas@0
    78
    # binary search for insertion point...
colas@0
    79
    my $low = 0;
colas@0
    80
    my $index;
colas@0
    81
    my $found;
colas@0
    82
    while ( $low <= $high )
colas@0
    83
    {
colas@0
    84
        $index = ( $high + $low ) / 2;
colas@0
    85
colas@0
    86
        # $index = int(( $high + $low ) / 2);  # without 'use integer'
colas@0
    87
        $found = $array->[$index];
colas@0
    88
colas@0
    89
        if ( $aValue == $found )
colas@0
    90
        {
colas@0
    91
            return undef;
colas@0
    92
        }
colas@0
    93
        elsif ( $aValue > $found )
colas@0
    94
        {
colas@0
    95
            $low = $index + 1;
colas@0
    96
        }
colas@0
    97
        else
colas@0
    98
        {
colas@0
    99
            $high = $index - 1;
colas@0
   100
        }
colas@0
   101
    }
colas@0
   102
colas@0
   103
    # now insertion point is in $low.
colas@0
   104
    $array->[$low] = $aValue;    # overwrite next larger
colas@0
   105
    return $low;
colas@0
   106
}
colas@0
   107
colas@0
   108
# This method computes the longest common subsequence in $a and $b.
colas@0
   109
colas@0
   110
# Result is array or ref, whose contents is such that
colas@0
   111
#   $a->[ $i ] == $b->[ $result[ $i ] ]
colas@0
   112
# foreach $i in ( 0 .. $#result ) if $result[ $i ] is defined.
colas@0
   113
colas@0
   114
# An additional argument may be passed; this is a hash or key generating
colas@0
   115
# function that should return a string that uniquely identifies the given
colas@0
   116
# element.  It should be the case that if the key is the same, the elements
colas@0
   117
# will compare the same. If this parameter is undef or missing, the key
colas@0
   118
# will be the element as a string.
colas@0
   119
colas@0
   120
# By default, comparisons will use "eq" and elements will be turned into keys
colas@0
   121
# using the default stringizing operator '""'.
colas@0
   122
colas@0
   123
# Additional parameters, if any, will be passed to the key generation
colas@0
   124
# routine.
colas@0
   125
colas@0
   126
sub _longestCommonSubsequence
colas@0
   127
{
colas@0
   128
    my $a        = shift;    # array ref or hash ref
colas@0
   129
    my $b        = shift;    # array ref or hash ref
colas@0
   130
    my $counting = shift;    # scalar
colas@0
   131
    my $keyGen   = shift;    # code ref
colas@0
   132
    my $compare;             # code ref
colas@0
   133
colas@0
   134
    if ( ref($a) eq 'HASH' )
colas@0
   135
    {                        # prepared hash must be in $b
colas@0
   136
        my $tmp = $b;
colas@0
   137
        $b = $a;
colas@0
   138
        $a = $tmp;
colas@0
   139
    }
colas@0
   140
colas@0
   141
    # Check for bogus (non-ref) argument values
colas@0
   142
    if ( !ref($a) || !ref($b) )
colas@0
   143
    {
colas@0
   144
        my @callerInfo = caller(1);
colas@0
   145
        die 'error: must pass array or hash references to ' . $callerInfo[3];
colas@0
   146
    }
colas@0
   147
colas@0
   148
    # set up code refs
colas@0
   149
    # Note that these are optimized.
colas@0
   150
    if ( !defined($keyGen) )    # optimize for strings
colas@0
   151
    {
colas@0
   152
        $keyGen = sub { $_[0] };
colas@0
   153
        $compare = sub { my ( $a, $b ) = @_; $a eq $b };
colas@0
   154
    }
colas@0
   155
    else
colas@0
   156
    {
colas@0
   157
        $compare = sub {
colas@0
   158
            my $a = shift;
colas@0
   159
            my $b = shift;
colas@0
   160
            &$keyGen( $a, @_ ) eq &$keyGen( $b, @_ );
colas@0
   161
        };
colas@0
   162
    }
colas@0
   163
colas@0
   164
    my ( $aStart, $aFinish, $matchVector ) = ( 0, $#$a, [] );
colas@0
   165
    my ( $prunedCount, $bMatches ) = ( 0, {} );
colas@0
   166
colas@0
   167
    if ( ref($b) eq 'HASH' )    # was $bMatches prepared for us?
colas@0
   168
    {
colas@0
   169
        $bMatches = $b;
colas@0
   170
    }
colas@0
   171
    else
colas@0
   172
    {
colas@0
   173
        my ( $bStart, $bFinish ) = ( 0, $#$b );
colas@0
   174
colas@0
   175
        # First we prune off any common elements at the beginning
colas@0
   176
        while ( $aStart <= $aFinish
colas@0
   177
            and $bStart <= $bFinish
colas@0
   178
            and &$compare( $a->[$aStart], $b->[$bStart], @_ ) )
colas@0
   179
        {
colas@0
   180
            $matchVector->[ $aStart++ ] = $bStart++;
colas@0
   181
            $prunedCount++;
colas@0
   182
        }
colas@0
   183
colas@0
   184
        # now the end
colas@0
   185
        while ( $aStart <= $aFinish
colas@0
   186
            and $bStart <= $bFinish
colas@0
   187
            and &$compare( $a->[$aFinish], $b->[$bFinish], @_ ) )
colas@0
   188
        {
colas@0
   189
            $matchVector->[ $aFinish-- ] = $bFinish--;
colas@0
   190
            $prunedCount++;
colas@0
   191
        }
colas@0
   192
colas@0
   193
        # Now compute the equivalence classes of positions of elements
colas@0
   194
        $bMatches =
colas@0
   195
          _withPositionsOfInInterval( $b, $bStart, $bFinish, $keyGen, @_ );
colas@0
   196
    }
colas@0
   197
    my $thresh = [];
colas@0
   198
    my $links  = [];
colas@0
   199
colas@0
   200
    my ( $i, $ai, $j, $k );
colas@0
   201
    for ( $i = $aStart ; $i <= $aFinish ; $i++ )
colas@0
   202
    {
colas@0
   203
        $ai = &$keyGen( $a->[$i], @_ );
colas@0
   204
        if ( exists( $bMatches->{$ai} ) )
colas@0
   205
        {
colas@0
   206
            $k = 0;
colas@0
   207
            for $j ( @{ $bMatches->{$ai} } )
colas@0
   208
            {
colas@0
   209
colas@0
   210
                # optimization: most of the time this will be true
colas@0
   211
                if ( $k and $thresh->[$k] > $j and $thresh->[ $k - 1 ] < $j )
colas@0
   212
                {
colas@0
   213
                    $thresh->[$k] = $j;
colas@0
   214
                }
colas@0
   215
                else
colas@0
   216
                {
colas@0
   217
                    $k = _replaceNextLargerWith( $thresh, $j, $k );
colas@0
   218
                }
colas@0
   219
colas@0
   220
                # oddly, it's faster to always test this (CPU cache?).
colas@0
   221
                if ( defined($k) )
colas@0
   222
                {
colas@0
   223
                    $links->[$k] =
colas@0
   224
                      [ ( $k ? $links->[ $k - 1 ] : undef ), $i, $j ];
colas@0
   225
                }
colas@0
   226
            }
colas@0
   227
        }
colas@0
   228
    }
colas@0
   229
colas@0
   230
    if (@$thresh)
colas@0
   231
    {
colas@0
   232
        return $prunedCount + @$thresh if $counting;
colas@0
   233
        for ( my $link = $links->[$#$thresh] ; $link ; $link = $link->[0] )
colas@0
   234
        {
colas@0
   235
            $matchVector->[ $link->[1] ] = $link->[2];
colas@0
   236
        }
colas@0
   237
    }
colas@0
   238
    elsif ($counting)
colas@0
   239
    {
colas@0
   240
        return $prunedCount;
colas@0
   241
    }
colas@0
   242
colas@0
   243
    return wantarray ? @$matchVector : $matchVector;
colas@0
   244
}
colas@0
   245
colas@0
   246
sub traverse_sequences
colas@0
   247
{
colas@0
   248
    my $a                 = shift;          # array ref
colas@0
   249
    my $b                 = shift;          # array ref
colas@0
   250
    my $callbacks         = shift || {};
colas@0
   251
    my $keyGen            = shift;
colas@0
   252
    my $matchCallback     = $callbacks->{'MATCH'} || sub { };
colas@0
   253
    my $discardACallback  = $callbacks->{'DISCARD_A'} || sub { };
colas@0
   254
    my $finishedACallback = $callbacks->{'A_FINISHED'};
colas@0
   255
    my $discardBCallback  = $callbacks->{'DISCARD_B'} || sub { };
colas@0
   256
    my $finishedBCallback = $callbacks->{'B_FINISHED'};
colas@0
   257
    my $matchVector = _longestCommonSubsequence( $a, $b, 0, $keyGen, @_ );
colas@0
   258
colas@0
   259
    # Process all the lines in @$matchVector
colas@0
   260
    my $lastA = $#$a;
colas@0
   261
    my $lastB = $#$b;
colas@0
   262
    my $bi    = 0;
colas@0
   263
    my $ai;
colas@0
   264
colas@0
   265
    for ( $ai = 0 ; $ai <= $#$matchVector ; $ai++ )
colas@0
   266
    {
colas@0
   267
        my $bLine = $matchVector->[$ai];
colas@0
   268
        if ( defined($bLine) )    # matched
colas@0
   269
        {
colas@0
   270
            &$discardBCallback( $ai, $bi++, @_ ) while $bi < $bLine;
colas@0
   271
            &$matchCallback( $ai,    $bi++, @_ );
colas@0
   272
        }
colas@0
   273
        else
colas@0
   274
        {
colas@0
   275
            &$discardACallback( $ai, $bi, @_ );
colas@0
   276
        }
colas@0
   277
    }
colas@0
   278
colas@0
   279
    # The last entry (if any) processed was a match.
colas@0
   280
    # $ai and $bi point just past the last matching lines in their sequences.
colas@0
   281
colas@0
   282
    while ( $ai <= $lastA or $bi <= $lastB )
colas@0
   283
    {
colas@0
   284
colas@0
   285
        # last A?
colas@0
   286
        if ( $ai == $lastA + 1 and $bi <= $lastB )
colas@0
   287
        {
colas@0
   288
            if ( defined($finishedACallback) )
colas@0
   289
            {
colas@0
   290
                &$finishedACallback( $lastA, @_ );
colas@0
   291
                $finishedACallback = undef;
colas@0
   292
            }
colas@0
   293
            else
colas@0
   294
            {
colas@0
   295
                &$discardBCallback( $ai, $bi++, @_ ) while $bi <= $lastB;
colas@0
   296
            }
colas@0
   297
        }
colas@0
   298
colas@0
   299
        # last B?
colas@0
   300
        if ( $bi == $lastB + 1 and $ai <= $lastA )
colas@0
   301
        {
colas@0
   302
            if ( defined($finishedBCallback) )
colas@0
   303
            {
colas@0
   304
                &$finishedBCallback( $lastB, @_ );
colas@0
   305
                $finishedBCallback = undef;
colas@0
   306
            }
colas@0
   307
            else
colas@0
   308
            {
colas@0
   309
                &$discardACallback( $ai++, $bi, @_ ) while $ai <= $lastA;
colas@0
   310
            }
colas@0
   311
        }
colas@0
   312
colas@0
   313
        &$discardACallback( $ai++, $bi, @_ ) if $ai <= $lastA;
colas@0
   314
        &$discardBCallback( $ai, $bi++, @_ ) if $bi <= $lastB;
colas@0
   315
    }
colas@0
   316
colas@0
   317
    return 1;
colas@0
   318
}
colas@0
   319
colas@0
   320
sub traverse_balanced
colas@0
   321
{
colas@0
   322
    my $a                 = shift;              # array ref
colas@0
   323
    my $b                 = shift;              # array ref
colas@0
   324
    my $callbacks         = shift || {};
colas@0
   325
    my $keyGen            = shift;
colas@0
   326
    my $matchCallback     = $callbacks->{'MATCH'} || sub { };
colas@0
   327
    my $discardACallback  = $callbacks->{'DISCARD_A'} || sub { };
colas@0
   328
    my $discardBCallback  = $callbacks->{'DISCARD_B'} || sub { };
colas@0
   329
    my $changeCallback    = $callbacks->{'CHANGE'};
colas@0
   330
    my $matchVector = _longestCommonSubsequence( $a, $b, 0, $keyGen, @_ );
colas@0
   331
colas@0
   332
    # Process all the lines in match vector
colas@0
   333
    my $lastA = $#$a;
colas@0
   334
    my $lastB = $#$b;
colas@0
   335
    my $bi    = 0;
colas@0
   336
    my $ai    = 0;
colas@0
   337
    my $ma    = -1;
colas@0
   338
    my $mb;
colas@0
   339
colas@0
   340
    while (1)
colas@0
   341
    {
colas@0
   342
colas@0
   343
        # Find next match indices $ma and $mb
colas@0
   344
        do {
colas@0
   345
            $ma++;
colas@0
   346
        } while(
colas@0
   347
                $ma <= $#$matchVector
colas@0
   348
            &&  !defined $matchVector->[$ma]
colas@0
   349
        );
colas@0
   350
colas@0
   351
        last if $ma > $#$matchVector;    # end of matchVector?
colas@0
   352
        $mb = $matchVector->[$ma];
colas@0
   353
colas@0
   354
        # Proceed with discard a/b or change events until
colas@0
   355
        # next match
colas@0
   356
        while ( $ai < $ma || $bi < $mb )
colas@0
   357
        {
colas@0
   358
colas@0
   359
            if ( $ai < $ma && $bi < $mb )
colas@0
   360
            {
colas@0
   361
colas@0
   362
                # Change
colas@0
   363
                if ( defined $changeCallback )
colas@0
   364
                {
colas@0
   365
                    &$changeCallback( $ai++, $bi++, @_ );
colas@0
   366
                }
colas@0
   367
                else
colas@0
   368
                {
colas@0
   369
                    &$discardACallback( $ai++, $bi, @_ );
colas@0
   370
                    &$discardBCallback( $ai, $bi++, @_ );
colas@0
   371
                }
colas@0
   372
            }
colas@0
   373
            elsif ( $ai < $ma )
colas@0
   374
            {
colas@0
   375
                &$discardACallback( $ai++, $bi, @_ );
colas@0
   376
            }
colas@0
   377
            else
colas@0
   378
            {
colas@0
   379
colas@0
   380
                # $bi < $mb
colas@0
   381
                &$discardBCallback( $ai, $bi++, @_ );
colas@0
   382
            }
colas@0
   383
        }
colas@0
   384
colas@0
   385
        # Match
colas@0
   386
        &$matchCallback( $ai++, $bi++, @_ );
colas@0
   387
    }
colas@0
   388
colas@0
   389
    while ( $ai <= $lastA || $bi <= $lastB )
colas@0
   390
    {
colas@0
   391
        if ( $ai <= $lastA && $bi <= $lastB )
colas@0
   392
        {
colas@0
   393
colas@0
   394
            # Change
colas@0
   395
            if ( defined $changeCallback )
colas@0
   396
            {
colas@0
   397
                &$changeCallback( $ai++, $bi++, @_ );
colas@0
   398
            }
colas@0
   399
            else
colas@0
   400
            {
colas@0
   401
                &$discardACallback( $ai++, $bi, @_ );
colas@0
   402
                &$discardBCallback( $ai, $bi++, @_ );
colas@0
   403
            }
colas@0
   404
        }
colas@0
   405
        elsif ( $ai <= $lastA )
colas@0
   406
        {
colas@0
   407
            &$discardACallback( $ai++, $bi, @_ );
colas@0
   408
        }
colas@0
   409
        else
colas@0
   410
        {
colas@0
   411
colas@0
   412
            # $bi <= $lastB
colas@0
   413
            &$discardBCallback( $ai, $bi++, @_ );
colas@0
   414
        }
colas@0
   415
    }
colas@0
   416
colas@0
   417
    return 1;
colas@0
   418
}
colas@0
   419
colas@0
   420
sub prepare
colas@0
   421
{
colas@0
   422
    my $a       = shift;    # array ref
colas@0
   423
    my $keyGen  = shift;    # code ref
colas@0
   424
colas@0
   425
    # set up code ref
colas@0
   426
    $keyGen = sub { $_[0] } unless defined($keyGen);
colas@0
   427
colas@0
   428
    return scalar _withPositionsOfInInterval( $a, 0, $#$a, $keyGen, @_ );
colas@0
   429
}
colas@0
   430
colas@0
   431
sub LCS
colas@0
   432
{
colas@0
   433
    my $a = shift;                  # array ref
colas@0
   434
    my $b = shift;                  # array ref or hash ref
colas@0
   435
    my $matchVector = _longestCommonSubsequence( $a, $b, 0, @_ );
colas@0
   436
    my @retval;
colas@0
   437
    my $i;
colas@0
   438
    for ( $i = 0 ; $i <= $#$matchVector ; $i++ )
colas@0
   439
    {
colas@0
   440
        if ( defined( $matchVector->[$i] ) )
colas@0
   441
        {
colas@0
   442
            push ( @retval, $a->[$i] );
colas@0
   443
        }
colas@0
   444
    }
colas@0
   445
    return wantarray ? @retval : \@retval;
colas@0
   446
}
colas@0
   447
colas@0
   448
sub LCS_length
colas@0
   449
{
colas@0
   450
    my $a = shift;                          # array ref
colas@0
   451
    my $b = shift;                          # array ref or hash ref
colas@0
   452
    return _longestCommonSubsequence( $a, $b, 1, @_ );
colas@0
   453
}
colas@0
   454
colas@0
   455
sub LCSidx
colas@0
   456
{
colas@0
   457
    my $a= shift @_;
colas@0
   458
    my $b= shift @_;
colas@0
   459
    my $match= _longestCommonSubsequence( $a, $b, 0, @_ );
colas@0
   460
    my @am= grep defined $match->[$_], 0..$#$match;
colas@0
   461
    my @bm= @{$match}[@am];
colas@0
   462
    return \@am, \@bm;
colas@0
   463
}
colas@0
   464
colas@0
   465
sub compact_diff
colas@0
   466
{
colas@0
   467
    my $a= shift @_;
colas@0
   468
    my $b= shift @_;
colas@0
   469
    my( $am, $bm )= LCSidx( $a, $b, @_ );
colas@0
   470
    my @cdiff;
colas@0
   471
    my( $ai, $bi )= ( 0, 0 );
colas@0
   472
    push @cdiff, $ai, $bi;
colas@0
   473
    while( 1 ) {
colas@0
   474
        while(  @$am  &&  $ai == $am->[0]  &&  $bi == $bm->[0]  ) {
colas@0
   475
            shift @$am;
colas@0
   476
            shift @$bm;
colas@0
   477
            ++$ai, ++$bi;
colas@0
   478
        }
colas@0
   479
        push @cdiff, $ai, $bi;
colas@0
   480
        last   if  ! @$am;
colas@0
   481
        $ai = $am->[0];
colas@0
   482
        $bi = $bm->[0];
colas@0
   483
        push @cdiff, $ai, $bi;
colas@0
   484
    }
colas@0
   485
    push @cdiff, 0+@$a, 0+@$b
colas@0
   486
        if  $ai < @$a || $bi < @$b;
colas@0
   487
    return wantarray ? @cdiff : \@cdiff;
colas@0
   488
}
colas@0
   489
colas@0
   490
sub diff
colas@0
   491
{
colas@0
   492
    my $a      = shift;    # array ref
colas@0
   493
    my $b      = shift;    # array ref
colas@0
   494
    my $retval = [];
colas@0
   495
    my $hunk   = [];
colas@0
   496
    my $discard = sub {
colas@0
   497
        push @$hunk, [ '-', $_[0], $a->[ $_[0] ] ];
colas@0
   498
    };
colas@0
   499
    my $add = sub {
colas@0
   500
        push @$hunk, [ '+', $_[1], $b->[ $_[1] ] ];
colas@0
   501
    };
colas@0
   502
    my $match = sub {
colas@0
   503
        push @$retval, $hunk
colas@0
   504
            if 0 < @$hunk;
colas@0
   505
        $hunk = []
colas@0
   506
    };
colas@0
   507
    traverse_sequences( $a, $b,
colas@0
   508
        { MATCH => $match, DISCARD_A => $discard, DISCARD_B => $add }, @_ );
colas@0
   509
    &$match();
colas@0
   510
    return wantarray ? @$retval : $retval;
colas@0
   511
}
colas@0
   512
colas@0
   513
sub sdiff
colas@0
   514
{
colas@0
   515
    my $a      = shift;    # array ref
colas@0
   516
    my $b      = shift;    # array ref
colas@0
   517
    my $retval = [];
colas@0
   518
    my $discard = sub { push ( @$retval, [ '-', $a->[ $_[0] ], "" ] ) };
colas@0
   519
    my $add = sub { push ( @$retval, [ '+', "", $b->[ $_[1] ] ] ) };
colas@0
   520
    my $change = sub {
colas@0
   521
        push ( @$retval, [ 'c', $a->[ $_[0] ], $b->[ $_[1] ] ] );
colas@0
   522
    };
colas@0
   523
    my $match = sub {
colas@0
   524
        push ( @$retval, [ 'u', $a->[ $_[0] ], $b->[ $_[1] ] ] );
colas@0
   525
    };
colas@0
   526
    traverse_balanced(
colas@0
   527
        $a,
colas@0
   528
        $b,
colas@0
   529
        {
colas@0
   530
            MATCH     => $match,
colas@0
   531
            DISCARD_A => $discard,
colas@0
   532
            DISCARD_B => $add,
colas@0
   533
            CHANGE    => $change,
colas@0
   534
        },
colas@0
   535
        @_
colas@0
   536
    );
colas@0
   537
    return wantarray ? @$retval : $retval;
colas@0
   538
}
colas@0
   539
colas@0
   540
########################################
colas@0
   541
my $Root= __PACKAGE__;
colas@0
   542
package Algorithm::Diff::_impl;
colas@0
   543
use strict;
colas@0
   544
colas@0
   545
sub _Idx()  { 0 } # $me->[_Idx]: Ref to array of hunk indices
colas@0
   546
            # 1   # $me->[1]: Ref to first sequence
colas@0
   547
            # 2   # $me->[2]: Ref to second sequence
colas@0
   548
sub _End()  { 3 } # $me->[_End]: Diff between forward and reverse pos
colas@0
   549
sub _Same() { 4 } # $me->[_Same]: 1 if pos 1 contains unchanged items
colas@0
   550
sub _Base() { 5 } # $me->[_Base]: Added to range's min and max
colas@0
   551
sub _Pos()  { 6 } # $me->[_Pos]: Which hunk is currently selected
colas@0
   552
sub _Off()  { 7 } # $me->[_Off]: Offset into _Idx for current position
colas@0
   553
sub _Min() { -2 } # Added to _Off to get min instead of max+1
colas@0
   554
colas@0
   555
sub Die
colas@0
   556
{
colas@0
   557
    require Carp;
colas@0
   558
    Carp::confess( @_ );
colas@0
   559
}
colas@0
   560
colas@0
   561
sub _ChkPos
colas@0
   562
{
colas@0
   563
    my( $me )= @_;
colas@0
   564
    return   if  $me->[_Pos];
colas@0
   565
    my $meth= ( caller(1) )[3];
colas@0
   566
    Die( "Called $meth on 'reset' object" );
colas@0
   567
}
colas@0
   568
colas@0
   569
sub _ChkSeq
colas@0
   570
{
colas@0
   571
    my( $me, $seq )= @_;
colas@0
   572
    return $seq + $me->[_Off]
colas@0
   573
        if  1 == $seq  ||  2 == $seq;
colas@0
   574
    my $meth= ( caller(1) )[3];
colas@0
   575
    Die( "$meth: Invalid sequence number ($seq); must be 1 or 2" );
colas@0
   576
}
colas@0
   577
colas@0
   578
sub getObjPkg
colas@0
   579
{
colas@0
   580
    my( $us )= @_;
colas@0
   581
    return ref $us   if  ref $us;
colas@0
   582
    return $us . "::_obj";
colas@0
   583
}
colas@0
   584
colas@0
   585
sub new
colas@0
   586
{
colas@0
   587
    my( $us, $seq1, $seq2, $opts ) = @_;
colas@0
   588
    my @args;
colas@0
   589
    for( $opts->{keyGen} ) {
colas@0
   590
        push @args, $_   if  $_;
colas@0
   591
    }
colas@0
   592
    for( $opts->{keyGenArgs} ) {
colas@0
   593
        push @args, @$_   if  $_;
colas@0
   594
    }
colas@0
   595
    my $cdif= Algorithm::Diff::compact_diff( $seq1, $seq2, @args );
colas@0
   596
    my $same= 1;
colas@0
   597
    if(  0 == $cdif->[2]  &&  0 == $cdif->[3]  ) {
colas@0
   598
        $same= 0;
colas@0
   599
        splice @$cdif, 0, 2;
colas@0
   600
    }
colas@0
   601
    my @obj= ( $cdif, $seq1, $seq2 );
colas@0
   602
    $obj[_End] = (1+@$cdif)/2;
colas@0
   603
    $obj[_Same] = $same;
colas@0
   604
    $obj[_Base] = 0;
colas@0
   605
    my $me = bless \@obj, $us->getObjPkg();
colas@0
   606
    $me->Reset( 0 );
colas@0
   607
    return $me;
colas@0
   608
}
colas@0
   609
colas@0
   610
sub Reset
colas@0
   611
{
colas@0
   612
    my( $me, $pos )= @_;
colas@0
   613
    $pos= int( $pos || 0 );
colas@0
   614
    $pos += $me->[_End]
colas@0
   615
        if  $pos < 0;
colas@0
   616
    $pos= 0
colas@0
   617
        if  $pos < 0  ||  $me->[_End] <= $pos;
colas@0
   618
    $me->[_Pos]= $pos || !1;
colas@0
   619
    $me->[_Off]= 2*$pos - 1;
colas@0
   620
    return $me;
colas@0
   621
}
colas@0
   622
colas@0
   623
sub Base
colas@0
   624
{
colas@0
   625
    my( $me, $base )= @_;
colas@0
   626
    my $oldBase= $me->[_Base];
colas@0
   627
    $me->[_Base]= 0+$base   if  defined $base;
colas@0
   628
    return $oldBase;
colas@0
   629
}
colas@0
   630
colas@0
   631
sub Copy
colas@0
   632
{
colas@0
   633
    my( $me, $pos, $base )= @_;
colas@0
   634
    my @obj= @$me;
colas@0
   635
    my $you= bless \@obj, ref($me);
colas@0
   636
    $you->Reset( $pos )   if  defined $pos;
colas@0
   637
    $you->Base( $base );
colas@0
   638
    return $you;
colas@0
   639
}
colas@0
   640
colas@0
   641
sub Next {
colas@0
   642
    my( $me, $steps )= @_;
colas@0
   643
    $steps= 1   if  ! defined $steps;
colas@0
   644
    if( $steps ) {
colas@0
   645
        my $pos= $me->[_Pos];
colas@0
   646
        my $new= $pos + $steps;
colas@0
   647
        $new= 0   if  $pos  &&  $new < 0;
colas@0
   648
        $me->Reset( $new )
colas@0
   649
    }
colas@0
   650
    return $me->[_Pos];
colas@0
   651
}
colas@0
   652
colas@0
   653
sub Prev {
colas@0
   654
    my( $me, $steps )= @_;
colas@0
   655
    $steps= 1   if  ! defined $steps;
colas@0
   656
    my $pos= $me->Next(-$steps);
colas@0
   657
    $pos -= $me->[_End]   if  $pos;
colas@0
   658
    return $pos;
colas@0
   659
}
colas@0
   660
colas@0
   661
sub Diff {
colas@0
   662
    my( $me )= @_;
colas@0
   663
    $me->_ChkPos();
colas@0
   664
    return 0   if  $me->[_Same] == ( 1 & $me->[_Pos] );
colas@0
   665
    my $ret= 0;
colas@0
   666
    my $off= $me->[_Off];
colas@0
   667
    for my $seq ( 1, 2 ) {
colas@0
   668
        $ret |= $seq
colas@0
   669
            if  $me->[_Idx][ $off + $seq + _Min ]
colas@0
   670
            <   $me->[_Idx][ $off + $seq ];
colas@0
   671
    }
colas@0
   672
    return $ret;
colas@0
   673
}
colas@0
   674
colas@0
   675
sub Min {
colas@0
   676
    my( $me, $seq, $base )= @_;
colas@0
   677
    $me->_ChkPos();
colas@0
   678
    my $off= $me->_ChkSeq($seq);
colas@0
   679
    $base= $me->[_Base] if !defined $base;
colas@0
   680
    return $base + $me->[_Idx][ $off + _Min ];
colas@0
   681
}
colas@0
   682
colas@0
   683
sub Max {
colas@0
   684
    my( $me, $seq, $base )= @_;
colas@0
   685
    $me->_ChkPos();
colas@0
   686
    my $off= $me->_ChkSeq($seq);
colas@0
   687
    $base= $me->[_Base] if !defined $base;
colas@0
   688
    return $base + $me->[_Idx][ $off ] -1;
colas@0
   689
}
colas@0
   690
colas@0
   691
sub Range {
colas@0
   692
    my( $me, $seq, $base )= @_;
colas@0
   693
    $me->_ChkPos();
colas@0
   694
    my $off = $me->_ChkSeq($seq);
colas@0
   695
    if( !wantarray ) {
colas@0
   696
        return  $me->[_Idx][ $off ]
colas@0
   697
            -   $me->[_Idx][ $off + _Min ];
colas@0
   698
    }
colas@0
   699
    $base= $me->[_Base] if !defined $base;
colas@0
   700
    return  ( $base + $me->[_Idx][ $off + _Min ] )
colas@0
   701
        ..  ( $base + $me->[_Idx][ $off ] - 1 );
colas@0
   702
}
colas@0
   703
colas@0
   704
sub Items {
colas@0
   705
    my( $me, $seq )= @_;
colas@0
   706
    $me->_ChkPos();
colas@0
   707
    my $off = $me->_ChkSeq($seq);
colas@0
   708
    if( !wantarray ) {
colas@0
   709
        return  $me->[_Idx][ $off ]
colas@0
   710
            -   $me->[_Idx][ $off + _Min ];
colas@0
   711
    }
colas@0
   712
    return
colas@0
   713
        @{$me->[$seq]}[
colas@0
   714
                $me->[_Idx][ $off + _Min ]
colas@0
   715
            ..  ( $me->[_Idx][ $off ] - 1 )
colas@0
   716
        ];
colas@0
   717
}
colas@0
   718
colas@0
   719
sub Same {
colas@0
   720
    my( $me )= @_;
colas@0
   721
    $me->_ChkPos();
colas@0
   722
    return wantarray ? () : 0
colas@0
   723
        if  $me->[_Same] != ( 1 & $me->[_Pos] );
colas@0
   724
    return $me->Items(1);
colas@0
   725
}
colas@0
   726
colas@0
   727
my %getName;
colas@0
   728
BEGIN {
colas@0
   729
    %getName= (
colas@0
   730
        same => \&Same,
colas@0
   731
        diff => \&Diff,
colas@0
   732
        base => \&Base,
colas@0
   733
        min  => \&Min,
colas@0
   734
        max  => \&Max,
colas@0
   735
        range=> \&Range,
colas@0
   736
        items=> \&Items, # same thing
colas@0
   737
    );
colas@0
   738
}
colas@0
   739
colas@0
   740
sub Get
colas@0
   741
{
colas@0
   742
    my $me= shift @_;
colas@0
   743
    $me->_ChkPos();
colas@0
   744
    my @value;
colas@0
   745
    for my $arg (  @_  ) {
colas@0
   746
        for my $word (  split ' ', $arg  ) {
colas@0
   747
            my $meth;
colas@0
   748
            if(     $word !~ /^(-?\d+)?([a-zA-Z]+)([12])?$/
colas@0
   749
                ||  not  $meth= $getName{ lc $2 }
colas@0
   750
            ) {
colas@0
   751
                Die( $Root, ", Get: Invalid request ($word)" );
colas@0
   752
            }
colas@0
   753
            my( $base, $name, $seq )= ( $1, $2, $3 );
colas@0
   754
            push @value, scalar(
colas@0
   755
                4 == length($name)
colas@0
   756
                    ? $meth->( $me )
colas@0
   757
                    : $meth->( $me, $seq, $base )
colas@0
   758
            );
colas@0
   759
        }
colas@0
   760
    }
colas@0
   761
    if(  wantarray  ) {
colas@0
   762
        return @value;
colas@0
   763
    } elsif(  1 == @value  ) {
colas@0
   764
        return $value[0];
colas@0
   765
    }
colas@0
   766
    Die( 0+@value, " values requested from ",
colas@0
   767
        $Root, "'s Get in scalar context" );
colas@0
   768
}
colas@0
   769
colas@0
   770
colas@0
   771
my $Obj= getObjPkg($Root);
colas@0
   772
no strict 'refs';
colas@0
   773
colas@0
   774
for my $meth (  qw( new getObjPkg )  ) {
colas@0
   775
    *{$Root."::".$meth} = \&{$meth};
colas@0
   776
    *{$Obj ."::".$meth} = \&{$meth};
colas@0
   777
}
colas@0
   778
for my $meth (  qw(
colas@0
   779
    Next Prev Reset Copy Base Diff
colas@0
   780
    Same Items Range Min Max Get
colas@0
   781
    _ChkPos _ChkSeq
colas@0
   782
)  ) {
colas@0
   783
    *{$Obj."::".$meth} = \&{$meth};
colas@0
   784
}
colas@0
   785
colas@0
   786
1;
colas@0
   787
__END__
colas@0
   788
colas@0
   789
=head1 NAME
colas@0
   790
colas@0
   791
Algorithm::Diff - Compute `intelligent' differences between two files / lists
colas@0
   792
colas@0
   793
=head1 SYNOPSIS
colas@0
   794
colas@0
   795
    require Algorithm::Diff;
colas@0
   796
colas@0
   797
    # This example produces traditional 'diff' output:
colas@0
   798
colas@0
   799
    my $diff = Algorithm::Diff->new( \@seq1, \@seq2 );
colas@0
   800
colas@0
   801
    $diff->Base( 1 );   # Return line numbers, not indices
colas@0
   802
    while(  $diff->Next()  ) {
colas@0
   803
        next   if  $diff->Same();
colas@0
   804
        my $sep = '';
colas@0
   805
        if(  ! $diff->Items(2)  ) {
colas@0
   806
            sprintf "%d,%dd%d\n",
colas@0
   807
                $diff->Get(qw( Min1 Max1 Max2 ));
colas@0
   808
        } elsif(  ! $diff->Items(1)  ) {
colas@0
   809
            sprint "%da%d,%d\n",
colas@0
   810
                $diff->Get(qw( Max1 Min2 Max2 ));
colas@0
   811
        } else {
colas@0
   812
            $sep = "---\n";
colas@0
   813
            sprintf "%d,%dc%d,%d\n",
colas@0
   814
                $diff->Get(qw( Min1 Max1 Min2 Max2 ));
colas@0
   815
        }
colas@0
   816
        print "< $_"   for  $diff->Items(1);
colas@0
   817
        print $sep;
colas@0
   818
        print "> $_"   for  $diff->Items(2);
colas@0
   819
    }
colas@0
   820
colas@0
   821
colas@0
   822
    # Alternate interfaces:
colas@0
   823
colas@0
   824
    use Algorithm::Diff qw(
colas@0
   825
        LCS LCS_length LCSidx
colas@0
   826
        diff sdiff compact_diff
colas@0
   827
        traverse_sequences traverse_balanced );
colas@0
   828
colas@0
   829
    @lcs    = LCS( \@seq1, \@seq2 );
colas@0
   830
    $lcsref = LCS( \@seq1, \@seq2 );
colas@0
   831
    $count  = LCS_length( \@seq1, \@seq2 );
colas@0
   832
colas@0
   833
    ( $seq1idxref, $seq2idxref ) = LCSidx( \@seq1, \@seq2 );
colas@0
   834
colas@0
   835
colas@0
   836
    # Complicated interfaces:
colas@0
   837
colas@0
   838
    @diffs  = diff( \@seq1, \@seq2 );
colas@0
   839
colas@0
   840
    @sdiffs = sdiff( \@seq1, \@seq2 );
colas@0
   841
colas@0
   842
    @cdiffs = compact_diff( \@seq1, \@seq2 );
colas@0
   843
colas@0
   844
    traverse_sequences(
colas@0
   845
        \@seq1,
colas@0
   846
        \@seq2,
colas@0
   847
        {   MATCH     => \&callback1,
colas@0
   848
            DISCARD_A => \&callback2,
colas@0
   849
            DISCARD_B => \&callback3,
colas@0
   850
        },
colas@0
   851
        \&key_generator,
colas@0
   852
        @extra_args,
colas@0
   853
    );
colas@0
   854
colas@0
   855
    traverse_balanced(
colas@0
   856
        \@seq1,
colas@0
   857
        \@seq2,
colas@0
   858
        {   MATCH     => \&callback1,
colas@0
   859
            DISCARD_A => \&callback2,
colas@0
   860
            DISCARD_B => \&callback3,
colas@0
   861
            CHANGE    => \&callback4,
colas@0
   862
        },
colas@0
   863
        \&key_generator,
colas@0
   864
        @extra_args,
colas@0
   865
    );
colas@0
   866
colas@0
   867
colas@0
   868
=head1 INTRODUCTION
colas@0
   869
colas@0
   870
(by Mark-Jason Dominus)
colas@0
   871
colas@0
   872
I once read an article written by the authors of C<diff>; they said
colas@0
   873
that they worked very hard on the algorithm until they found the
colas@0
   874
right one.
colas@0
   875
colas@0
   876
I think what they ended up using (and I hope someone will correct me,
colas@0
   877
because I am not very confident about this) was the `longest common
colas@0
   878
subsequence' method.  In the LCS problem, you have two sequences of
colas@0
   879
items:
colas@0
   880
colas@0
   881
    a b c d f g h j q z
colas@0
   882
colas@0
   883
    a b c d e f g i j k r x y z
colas@0
   884
colas@0
   885
and you want to find the longest sequence of items that is present in
colas@0
   886
both original sequences in the same order.  That is, you want to find
colas@0
   887
a new sequence I<S> which can be obtained from the first sequence by
colas@0
   888
deleting some items, and from the secend sequence by deleting other
colas@0
   889
items.  You also want I<S> to be as long as possible.  In this case I<S>
colas@0
   890
is
colas@0
   891
colas@0
   892
    a b c d f g j z
colas@0
   893
colas@0
   894
From there it's only a small step to get diff-like output:
colas@0
   895
colas@0
   896
    e   h i   k   q r x y
colas@0
   897
    +   - +   +   - + + +
colas@0
   898
colas@0
   899
This module solves the LCS problem.  It also includes a canned function
colas@0
   900
to generate C<diff>-like output.
colas@0
   901
colas@0
   902
It might seem from the example above that the LCS of two sequences is
colas@0
   903
always pretty obvious, but that's not always the case, especially when
colas@0
   904
the two sequences have many repeated elements.  For example, consider
colas@0
   905
colas@0
   906
    a x b y c z p d q
colas@0
   907
    a b c a x b y c z
colas@0
   908
colas@0
   909
A naive approach might start by matching up the C<a> and C<b> that
colas@0
   910
appear at the beginning of each sequence, like this:
colas@0
   911
colas@0
   912
    a x b y c         z p d q
colas@0
   913
    a   b   c a b y c z
colas@0
   914
colas@0
   915
This finds the common subsequence C<a b c z>.  But actually, the LCS
colas@0
   916
is C<a x b y c z>:
colas@0
   917
colas@0
   918
          a x b y c z p d q
colas@0
   919
    a b c a x b y c z
colas@0
   920
colas@0
   921
or
colas@0
   922
colas@0
   923
    a       x b y c z p d q
colas@0
   924
    a b c a x b y c z
colas@0
   925
colas@0
   926
=head1 USAGE
colas@0
   927
colas@0
   928
(See also the README file and several example
colas@0
   929
scripts include with this module.)
colas@0
   930
colas@0
   931
This module now provides an object-oriented interface that uses less
colas@0
   932
memory and is easier to use than most of the previous procedural
colas@0
   933
interfaces.  It also still provides several exportable functions.  We'll
colas@0
   934
deal with these in ascending order of difficulty:  C<LCS>,
colas@0
   935
C<LCS_length>, C<LCSidx>, OO interface, C<prepare>, C<diff>, C<sdiff>,
colas@0
   936
C<traverse_sequences>, and C<traverse_balanced>.
colas@0
   937
colas@0
   938
=head2 C<LCS>
colas@0
   939
colas@0
   940
Given references to two lists of items, LCS returns an array containing
colas@0
   941
their longest common subsequence.  In scalar context, it returns a
colas@0
   942
reference to such a list.
colas@0
   943
colas@0
   944
    @lcs    = LCS( \@seq1, \@seq2 );
colas@0
   945
    $lcsref = LCS( \@seq1, \@seq2 );
colas@0
   946
colas@0
   947
C<LCS> may be passed an optional third parameter; this is a CODE
colas@0
   948
reference to a key generation function.  See L</KEY GENERATION
colas@0
   949
FUNCTIONS>.
colas@0
   950
colas@0
   951
    @lcs    = LCS( \@seq1, \@seq2, \&keyGen, @args );
colas@0
   952
    $lcsref = LCS( \@seq1, \@seq2, \&keyGen, @args );
colas@0
   953
colas@0
   954
Additional parameters, if any, will be passed to the key generation
colas@0
   955
routine.
colas@0
   956
colas@0
   957
=head2 C<LCS_length>
colas@0
   958
colas@0
   959
This is just like C<LCS> except it only returns the length of the
colas@0
   960
longest common subsequence.  This provides a performance gain of about
colas@0
   961
9% compared to C<LCS>.
colas@0
   962
colas@0
   963
=head2 C<LCSidx>
colas@0
   964
colas@0
   965
Like C<LCS> except it returns references to two arrays.  The first array
colas@0
   966
contains the indices into @seq1 where the LCS items are located.  The
colas@0
   967
second array contains the indices into @seq2 where the LCS items are located.
colas@0
   968
colas@0
   969
Therefore, the following three lists will contain the same values:
colas@0
   970
colas@0
   971
    my( $idx1, $idx2 ) = LCSidx( \@seq1, \@seq2 );
colas@0
   972
    my @list1 = @seq1[ @$idx1 ];
colas@0
   973
    my @list2 = @seq2[ @$idx2 ];
colas@0
   974
    my @list3 = LCS( \@seq1, \@seq2 );
colas@0
   975
colas@0
   976
=head2 C<new>
colas@0
   977
colas@0
   978
    $diff = Algorithm::Diffs->new( \@seq1, \@seq2 );
colas@0
   979
    $diff = Algorithm::Diffs->new( \@seq1, \@seq2, \%opts );
colas@0
   980
colas@0
   981
C<new> computes the smallest set of additions and deletions necessary
colas@0
   982
to turn the first sequence into the second and compactly records them
colas@0
   983
in the object.
colas@0
   984
colas@0
   985
You use the object to iterate over I<hunks>, where each hunk represents
colas@0
   986
a contiguous section of items which should be added, deleted, replaced,
colas@0
   987
or left unchanged.
colas@0
   988
colas@0
   989
=over 4
colas@0
   990
colas@0
   991
The following summary of all of the methods looks a lot like Perl code
colas@0
   992
but some of the symbols have different meanings:
colas@0
   993
colas@0
   994
    [ ]     Encloses optional arguments
colas@0
   995
    :       Is followed by the default value for an optional argument
colas@0
   996
    |       Separates alternate return results
colas@0
   997
colas@0
   998
Method summary:
colas@0
   999
colas@0
  1000
    $obj        = Algorithm::Diff->new( \@seq1, \@seq2, [ \%opts ] );
colas@0
  1001
    $pos        = $obj->Next(  [ $count : 1 ] );
colas@0
  1002
    $revPos     = $obj->Prev(  [ $count : 1 ] );
colas@0
  1003
    $obj        = $obj->Reset( [ $pos : 0 ] );
colas@0
  1004
    $copy       = $obj->Copy(  [ $pos, [ $newBase ] ] );
colas@0
  1005
    $oldBase    = $obj->Base(  [ $newBase ] );
colas@0
  1006
colas@0
  1007
Note that all of the following methods C<die> if used on an object that
colas@0
  1008
is "reset" (not currently pointing at any hunk).
colas@0
  1009
colas@0
  1010
    $bits       = $obj->Diff(  );
colas@0
  1011
    @items|$cnt = $obj->Same(  );
colas@0
  1012
    @items|$cnt = $obj->Items( $seqNum );
colas@0
  1013
    @idxs |$cnt = $obj->Range( $seqNum, [ $base ] );
colas@0
  1014
    $minIdx     = $obj->Min(   $seqNum, [ $base ] );
colas@0
  1015
    $maxIdx     = $obj->Max(   $seqNum, [ $base ] );
colas@0
  1016
    @values     = $obj->Get(   @names );
colas@0
  1017
colas@0
  1018
Passing in C<undef> for an optional argument is always treated the same
colas@0
  1019
as if no argument were passed in.
colas@0
  1020
colas@0
  1021
=item C<Next>
colas@0
  1022
colas@0
  1023
    $pos = $diff->Next();    # Move forward 1 hunk
colas@0
  1024
    $pos = $diff->Next( 2 ); # Move forward 2 hunks
colas@0
  1025
    $pos = $diff->Next(-5);  # Move backward 5 hunks
colas@0
  1026
colas@0
  1027
C<Next> moves the object to point at the next hunk.  The object starts
colas@0
  1028
out "reset", which means it isn't pointing at any hunk.  If the object
colas@0
  1029
is reset, then C<Next()> moves to the first hunk.
colas@0
  1030
colas@0
  1031
C<Next> returns a true value iff the move didn't go past the last hunk.
colas@0
  1032
So C<Next(0)> will return true iff the object is not reset.
colas@0
  1033
colas@0
  1034
Actually, C<Next> returns the object's new position, which is a number
colas@0
  1035
between 1 and the number of hunks (inclusive), or returns a false value.
colas@0
  1036
colas@0
  1037
=item C<Prev>
colas@0
  1038
colas@0
  1039
C<Prev($N)> is almost identical to C<Next(-$N)>; it moves to the $Nth
colas@0
  1040
previous hunk.  On a 'reset' object, C<Prev()> [and C<Next(-1)>] move
colas@0
  1041
to the last hunk.
colas@0
  1042
colas@0
  1043
The position returned by C<Prev> is relative to the I<end> of the
colas@0
  1044
hunks; -1 for the last hunk, -2 for the second-to-last, etc.
colas@0
  1045
colas@0
  1046
=item C<Reset>
colas@0
  1047
colas@0
  1048
    $diff->Reset();     # Reset the object's position
colas@0
  1049
    $diff->Reset($pos); # Move to the specified hunk
colas@0
  1050
    $diff->Reset(1);    # Move to the first hunk
colas@0
  1051
    $diff->Reset(-1);   # Move to the last hunk
colas@0
  1052
colas@0
  1053
C<Reset> returns the object, so, for example, you could use
colas@0
  1054
C<< $diff->Reset()->Next(-1) >> to get the number of hunks.
colas@0
  1055
colas@0
  1056
=item C<Copy>
colas@0
  1057
colas@0
  1058
    $copy = $diff->Copy( $newPos, $newBase );
colas@0
  1059
colas@0
  1060
C<Copy> returns a copy of the object.  The copy and the orignal object
colas@0
  1061
share most of their data, so making copies takes very little memory.
colas@0
  1062
The copy maintains its own position (separate from the original), which
colas@0
  1063
is the main purpose of copies.  It also maintains its own base.
colas@0
  1064
colas@0
  1065
By default, the copy's position starts out the same as the original
colas@0
  1066
object's position.  But C<Copy> takes an optional first argument to set the
colas@0
  1067
new position, so the following three snippets are equivalent:
colas@0
  1068
colas@0
  1069
    $copy = $diff->Copy($pos);
colas@0
  1070
colas@0
  1071
    $copy = $diff->Copy();
colas@0
  1072
    $copy->Reset($pos);
colas@0
  1073
colas@0
  1074
    $copy = $diff->Copy()->Reset($pos);
colas@0
  1075
colas@0
  1076
C<Copy> takes an optional second argument to set the base for
colas@0
  1077
the copy.  If you wish to change the base of the copy but leave
colas@0
  1078
the position the same as in the original, here are two
colas@0
  1079
equivalent ways:
colas@0
  1080
colas@0
  1081
    $copy = $diff->Copy();
colas@0
  1082
    $copy->Base( 0 );
colas@0
  1083
colas@0
  1084
    $copy = $diff->Copy(undef,0);
colas@0
  1085
colas@0
  1086
Here are two equivalent way to get a "reset" copy:
colas@0
  1087
colas@0
  1088
    $copy = $diff->Copy(0);
colas@0
  1089
colas@0
  1090
    $copy = $diff->Copy()->Reset();
colas@0
  1091
colas@0
  1092
=item C<Diff>
colas@0
  1093
colas@0
  1094
    $bits = $obj->Diff();
colas@0
  1095
colas@0
  1096
C<Diff> returns a true value iff the current hunk contains items that are
colas@0
  1097
different between the two sequences.  It actually returns one of the
colas@0
  1098
follow 4 values:
colas@0
  1099
colas@0
  1100
=over 4
colas@0
  1101
colas@0
  1102
=item 3
colas@0
  1103
colas@0
  1104
C<3==(1|2)>.  This hunk contains items from @seq1 and the items
colas@0
  1105
from @seq2 that should replace them.  Both sequence 1 and 2
colas@0
  1106
contain changed items so both the 1 and 2 bits are set.
colas@0
  1107
colas@0
  1108
=item 2
colas@0
  1109
colas@0
  1110
This hunk only contains items from @seq2 that should be inserted (not
colas@0
  1111
items from @seq1).  Only sequence 2 contains changed items so only the 2
colas@0
  1112
bit is set.
colas@0
  1113
colas@0
  1114
=item 1
colas@0
  1115
colas@0
  1116
This hunk only contains items from @seq1 that should be deleted (not
colas@0
  1117
items from @seq2).  Only sequence 1 contains changed items so only the 1
colas@0
  1118
bit is set.
colas@0
  1119
colas@0
  1120
=item 0
colas@0
  1121
colas@0
  1122
This means that the items in this hunk are the same in both sequences.
colas@0
  1123
Neither sequence 1 nor 2 contain changed items so neither the 1 nor the
colas@0
  1124
2 bits are set.
colas@0
  1125
colas@0
  1126
=back
colas@0
  1127
colas@0
  1128
=item C<Same>
colas@0
  1129
colas@0
  1130
C<Same> returns a true value iff the current hunk contains items that
colas@0
  1131
are the same in both sequences.  It actually returns the list of items
colas@0
  1132
if they are the same or an emty list if they aren't.  In a scalar
colas@0
  1133
context, it returns the size of the list.
colas@0
  1134
colas@0
  1135
=item C<Items>
colas@0
  1136
colas@0
  1137
    $count = $diff->Items(2);
colas@0
  1138
    @items = $diff->Items($seqNum);
colas@0
  1139
colas@0
  1140
C<Items> returns the (number of) items from the specified sequence that
colas@0
  1141
are part of the current hunk.
colas@0
  1142
colas@0
  1143
If the current hunk contains only insertions, then
colas@0
  1144
C<< $diff->Items(1) >> will return an empty list (0 in a scalar conext).
colas@0
  1145
If the current hunk contains only deletions, then C<< $diff->Items(2) >>
colas@0
  1146
will return an empty list (0 in a scalar conext).
colas@0
  1147
colas@0
  1148
If the hunk contains replacements, then both C<< $diff->Items(1) >> and
colas@0
  1149
C<< $diff->Items(2) >> will return different, non-empty lists.
colas@0
  1150
colas@0
  1151
Otherwise, the hunk contains identical items and all of the following
colas@0
  1152
will return the same lists:
colas@0
  1153
colas@0
  1154
    @items = $diff->Items(1);
colas@0
  1155
    @items = $diff->Items(2);
colas@0
  1156
    @items = $diff->Same();
colas@0
  1157
colas@0
  1158
=item C<Range>
colas@0
  1159
colas@0
  1160
    $count = $diff->Range( $seqNum );
colas@0
  1161
    @indices = $diff->Range( $seqNum );
colas@0
  1162
    @indices = $diff->Range( $seqNum, $base );
colas@0
  1163
colas@0
  1164
C<Range> is like C<Items> except that it returns a list of I<indices> to
colas@0
  1165
the items rather than the items themselves.  By default, the index of
colas@0
  1166
the first item (in each sequence) is 0 but this can be changed by
colas@0
  1167
calling the C<Base> method.  So, by default, the following two snippets
colas@0
  1168
return the same lists:
colas@0
  1169
colas@0
  1170
    @list = $diff->Items(2);
colas@0
  1171
    @list = @seq2[ $diff->Range(2) ];
colas@0
  1172
colas@0
  1173
You can also specify the base to use as the second argument.  So the
colas@0
  1174
following two snippets I<always> return the same lists:
colas@0
  1175
colas@0
  1176
    @list = $diff->Items(1);
colas@0
  1177
    @list = @seq1[ $diff->Range(1,0) ];
colas@0
  1178
colas@0
  1179
=item C<Base>
colas@0
  1180
colas@0
  1181
    $curBase = $diff->Base();
colas@0
  1182
    $oldBase = $diff->Base($newBase);
colas@0
  1183
colas@0
  1184
C<Base> sets and/or returns the current base (usually 0 or 1) that is
colas@0
  1185
used when you request range information.  The base defaults to 0 so
colas@0
  1186
that range information is returned as array indices.  You can set the
colas@0
  1187
base to 1 if you want to report traditional line numbers instead.
colas@0
  1188
colas@0
  1189
=item C<Min>
colas@0
  1190
colas@0
  1191
    $min1 = $diff->Min(1);
colas@0
  1192
    $min = $diff->Min( $seqNum, $base );
colas@0
  1193
colas@0
  1194
C<Min> returns the first value that C<Range> would return (given the
colas@0
  1195
same arguments) or returns C<undef> if C<Range> would return an empty
colas@0
  1196
list.
colas@0
  1197
colas@0
  1198
=item C<Max>
colas@0
  1199
colas@0
  1200
C<Max> returns the last value that C<Range> would return or C<undef>.
colas@0
  1201
colas@0
  1202
=item C<Get>
colas@0
  1203
colas@0
  1204
    ( $n, $x, $r ) = $diff->Get(qw( min1 max1 range1 ));
colas@0
  1205
    @values = $diff->Get(qw( 0min2 1max2 range2 same base ));
colas@0
  1206
colas@0
  1207
C<Get> returns one or more scalar values.  You pass in a list of the
colas@0
  1208
names of the values you want returned.  Each name must match one of the
colas@0
  1209
following regexes:
colas@0
  1210
colas@0
  1211
    /^(-?\d+)?(min|max)[12]$/i
colas@0
  1212
    /^(range[12]|same|diff|base)$/i
colas@0
  1213
colas@0
  1214
The 1 or 2 after a name says which sequence you want the information
colas@0
  1215
for (and where allowed, it is required).  The optional number before
colas@0
  1216
"min" or "max" is the base to use.  So the following equalities hold:
colas@0
  1217
colas@0
  1218
    $diff->Get('min1') == $diff->Min(1)
colas@0
  1219
    $diff->Get('0min2') == $diff->Min(2,0)
colas@0
  1220
colas@0
  1221
Using C<Get> in a scalar context when you've passed in more than one
colas@0
  1222
name is a fatal error (C<die> is called).
colas@0
  1223
colas@0
  1224
=back
colas@0
  1225
colas@0
  1226
=head2 C<prepare>
colas@0
  1227
colas@0
  1228
Given a reference to a list of items, C<prepare> returns a reference
colas@0
  1229
to a hash which can be used when comparing this sequence to other
colas@0
  1230
sequences with C<LCS> or C<LCS_length>.
colas@0
  1231
colas@0
  1232
    $prep = prepare( \@seq1 );
colas@0
  1233
    for $i ( 0 .. 10_000 )
colas@0
  1234
    {
colas@0
  1235
        @lcs = LCS( $prep, $seq[$i] );
colas@0
  1236
        # do something useful with @lcs
colas@0
  1237
    }
colas@0
  1238
colas@0
  1239
C<prepare> may be passed an optional third parameter; this is a CODE
colas@0
  1240
reference to a key generation function.  See L</KEY GENERATION
colas@0
  1241
FUNCTIONS>.
colas@0
  1242
colas@0
  1243
    $prep = prepare( \@seq1, \&keyGen );
colas@0
  1244
    for $i ( 0 .. 10_000 )
colas@0
  1245
    {
colas@0
  1246
        @lcs = LCS( $seq[$i], $prep, \&keyGen );
colas@0
  1247
        # do something useful with @lcs
colas@0
  1248
    }
colas@0
  1249
colas@0
  1250
Using C<prepare> provides a performance gain of about 50% when calling LCS
colas@0
  1251
many times compared with not preparing.
colas@0
  1252
colas@0
  1253
=head2 C<diff>
colas@0
  1254
colas@0
  1255
    @diffs     = diff( \@seq1, \@seq2 );
colas@0
  1256
    $diffs_ref = diff( \@seq1, \@seq2 );
colas@0
  1257
colas@0
  1258
C<diff> computes the smallest set of additions and deletions necessary
colas@0
  1259
to turn the first sequence into the second, and returns a description
colas@0
  1260
of these changes.  The description is a list of I<hunks>; each hunk
colas@0
  1261
represents a contiguous section of items which should be added,
colas@0
  1262
deleted, or replaced.  (Hunks containing unchanged items are not
colas@0
  1263
included.)
colas@0
  1264
colas@0
  1265
The return value of C<diff> is a list of hunks, or, in scalar context, a
colas@0
  1266
reference to such a list.  If there are no differences, the list will be
colas@0
  1267
empty.
colas@0
  1268
colas@0
  1269
Here is an example.  Calling C<diff> for the following two sequences:
colas@0
  1270
colas@0
  1271
    a b c e h j l m n p
colas@0
  1272
    b c d e f j k l m r s t
colas@0
  1273
colas@0
  1274
would produce the following list:
colas@0
  1275
colas@0
  1276
    (
colas@0
  1277
      [ [ '-', 0, 'a' ] ],
colas@0
  1278
colas@0
  1279
      [ [ '+', 2, 'd' ] ],
colas@0
  1280
colas@0
  1281
      [ [ '-', 4, 'h' ],
colas@0
  1282
        [ '+', 4, 'f' ] ],
colas@0
  1283
colas@0
  1284
      [ [ '+', 6, 'k' ] ],
colas@0
  1285
colas@0
  1286
      [ [ '-',  8, 'n' ],
colas@0
  1287
        [ '-',  9, 'p' ],
colas@0
  1288
        [ '+',  9, 'r' ],
colas@0
  1289
        [ '+', 10, 's' ],
colas@0
  1290
        [ '+', 11, 't' ] ],
colas@0
  1291
    )
colas@0
  1292
colas@0
  1293
There are five hunks here.  The first hunk says that the C<a> at
colas@0
  1294
position 0 of the first sequence should be deleted (C<->).  The second
colas@0
  1295
hunk says that the C<d> at position 2 of the second sequence should
colas@0
  1296
be inserted (C<+>).  The third hunk says that the C<h> at position 4
colas@0
  1297
of the first sequence should be removed and replaced with the C<f>
colas@0
  1298
from position 4 of the second sequence.  And so on.
colas@0
  1299
colas@0
  1300
C<diff> may be passed an optional third parameter; this is a CODE
colas@0
  1301
reference to a key generation function.  See L</KEY GENERATION
colas@0
  1302
FUNCTIONS>.
colas@0
  1303
colas@0
  1304
Additional parameters, if any, will be passed to the key generation
colas@0
  1305
routine.
colas@0
  1306
colas@0
  1307
=head2 C<sdiff>
colas@0
  1308
colas@0
  1309
    @sdiffs     = sdiff( \@seq1, \@seq2 );
colas@0
  1310
    $sdiffs_ref = sdiff( \@seq1, \@seq2 );
colas@0
  1311
colas@0
  1312
C<sdiff> computes all necessary components to show two sequences
colas@0
  1313
and their minimized differences side by side, just like the
colas@0
  1314
Unix-utility I<sdiff> does:
colas@0
  1315
colas@0
  1316
    same             same
colas@0
  1317
    before     |     after
colas@0
  1318
    old        <     -
colas@0
  1319
    -          >     new
colas@0
  1320
colas@0
  1321
It returns a list of array refs, each pointing to an array of
colas@0
  1322
display instructions. In scalar context it returns a reference
colas@0
  1323
to such a list. If there are no differences, the list will have one
colas@0
  1324
entry per item, each indicating that the item was unchanged.
colas@0
  1325
colas@0
  1326
Display instructions consist of three elements: A modifier indicator
colas@0
  1327
(C<+>: Element added, C<->: Element removed, C<u>: Element unmodified,
colas@0
  1328
C<c>: Element changed) and the value of the old and new elements, to
colas@0
  1329
be displayed side-by-side.
colas@0
  1330
colas@0
  1331
An C<sdiff> of the following two sequences:
colas@0
  1332
colas@0
  1333
    a b c e h j l m n p
colas@0
  1334
    b c d e f j k l m r s t
colas@0
  1335
colas@0
  1336
results in
colas@0
  1337
colas@0
  1338
    ( [ '-', 'a', ''  ],
colas@0
  1339
      [ 'u', 'b', 'b' ],
colas@0
  1340
      [ 'u', 'c', 'c' ],
colas@0
  1341
      [ '+', '',  'd' ],
colas@0
  1342
      [ 'u', 'e', 'e' ],
colas@0
  1343
      [ 'c', 'h', 'f' ],
colas@0
  1344
      [ 'u', 'j', 'j' ],
colas@0
  1345
      [ '+', '',  'k' ],
colas@0
  1346
      [ 'u', 'l', 'l' ],
colas@0
  1347
      [ 'u', 'm', 'm' ],
colas@0
  1348
      [ 'c', 'n', 'r' ],
colas@0
  1349
      [ 'c', 'p', 's' ],
colas@0
  1350
      [ '+', '',  't' ],
colas@0
  1351
    )
colas@0
  1352
colas@0
  1353
C<sdiff> may be passed an optional third parameter; this is a CODE
colas@0
  1354
reference to a key generation function.  See L</KEY GENERATION
colas@0
  1355
FUNCTIONS>.
colas@0
  1356
colas@0
  1357
Additional parameters, if any, will be passed to the key generation
colas@0
  1358
routine.
colas@0
  1359
colas@0
  1360
=head2 C<compact_diff>
colas@0
  1361
colas@0
  1362
C<compact_diff> is much like C<sdiff> except it returns a much more
colas@0
  1363
compact description consisting of just one flat list of indices.  An
colas@0
  1364
example helps explain the format:
colas@0
  1365
colas@0
  1366
    my @a = qw( a b c   e  h j   l m n p      );
colas@0
  1367
    my @b = qw(   b c d e f  j k l m    r s t );
colas@0
  1368
    @cdiff = compact_diff( \@a, \@b );
colas@0
  1369
    # Returns:
colas@0
  1370
    #   @a      @b       @a       @b
colas@0
  1371
    #  start   start   values   values
colas@0
  1372
    (    0,      0,   #       =
colas@0
  1373
         0,      0,   #    a  !
colas@0
  1374
         1,      0,   #  b c  =  b c
colas@0
  1375
         3,      2,   #       !  d
colas@0
  1376
         3,      3,   #    e  =  e
colas@0
  1377
         4,      4,   #    f  !  h
colas@0
  1378
         5,      5,   #    j  =  j
colas@0
  1379
         6,      6,   #       !  k
colas@0
  1380
         6,      7,   #  l m  =  l m
colas@0
  1381
         8,      9,   #  n p  !  r s t
colas@0
  1382
        10,     12,   #
colas@0
  1383
    );
colas@0
  1384
colas@0
  1385
The 0th, 2nd, 4th, etc. entries are all indices into @seq1 (@a in the
colas@0
  1386
above example) indicating where a hunk begins.  The 1st, 3rd, 5th, etc.
colas@0
  1387
entries are all indices into @seq2 (@b in the above example) indicating
colas@0
  1388
where the same hunk begins.
colas@0
  1389
colas@0
  1390
So each pair of indices (except the last pair) describes where a hunk
colas@0
  1391
begins (in each sequence).  Since each hunk must end at the item just
colas@0
  1392
before the item that starts the next hunk, the next pair of indices can
colas@0
  1393
be used to determine where the hunk ends.
colas@0
  1394
colas@0
  1395
So, the first 4 entries (0..3) describe the first hunk.  Entries 0 and 1
colas@0
  1396
describe where the first hunk begins (and so are always both 0).
colas@0
  1397
Entries 2 and 3 describe where the next hunk begins, so subtracting 1
colas@0
  1398
from each tells us where the first hunk ends.  That is, the first hunk
colas@0
  1399
contains items C<$diff[0]> through C<$diff[2] - 1> of the first sequence
colas@0
  1400
and contains items C<$diff[1]> through C<$diff[3] - 1> of the second
colas@0
  1401
sequence.
colas@0
  1402
colas@0
  1403
In other words, the first hunk consists of the following two lists of items:
colas@0
  1404
colas@0
  1405
               #  1st pair     2nd pair
colas@0
  1406
               # of indices   of indices
colas@0
  1407
    @list1 = @a[ $cdiff[0] .. $cdiff[2]-1 ];
colas@0
  1408
    @list2 = @b[ $cdiff[1] .. $cdiff[3]-1 ];
colas@0
  1409
               # Hunk start   Hunk end
colas@0
  1410
colas@0
  1411
Note that the hunks will always alternate between those that are part of
colas@0
  1412
the LCS (those that contain unchanged items) and those that contain
colas@0
  1413
changes.  This means that all we need to be told is whether the first
colas@0
  1414
hunk is a 'same' or 'diff' hunk and we can determine which of the other
colas@0
  1415
hunks contain 'same' items or 'diff' items.
colas@0
  1416
colas@0
  1417
By convention, we always make the first hunk contain unchanged items.
colas@0
  1418
So the 1st, 3rd, 5th, etc. hunks (all odd-numbered hunks if you start
colas@0
  1419
counting from 1) all contain unchanged items.  And the 2nd, 4th, 6th,
colas@0
  1420
etc. hunks (all even-numbered hunks if you start counting from 1) all
colas@0
  1421
contain changed items.
colas@0
  1422
colas@0
  1423
Since @a and @b don't begin with the same value, the first hunk in our
colas@0
  1424
example is empty (otherwise we'd violate the above convention).  Note
colas@0
  1425
that the first 4 index values in our example are all zero.  Plug these
colas@0
  1426
values into our previous code block and we get:
colas@0
  1427
colas@0
  1428
    @hunk1a = @a[ 0 .. 0-1 ];
colas@0
  1429
    @hunk1b = @b[ 0 .. 0-1 ];
colas@0
  1430
colas@0
  1431
And C<0..-1> returns the empty list.
colas@0
  1432
colas@0
  1433
Move down one pair of indices (2..5) and we get the offset ranges for
colas@0
  1434
the second hunk, which contains changed items.
colas@0
  1435
colas@0
  1436
Since C<@diff[2..5]> contains (0,0,1,0) in our example, the second hunk
colas@0
  1437
consists of these two lists of items:
colas@0
  1438
colas@0
  1439
        @hunk2a = @a[ $cdiff[2] .. $cdiff[4]-1 ];
colas@0
  1440
        @hunk2b = @b[ $cdiff[3] .. $cdiff[5]-1 ];
colas@0
  1441
    # or
colas@0
  1442
        @hunk2a = @a[ 0 .. 1-1 ];
colas@0
  1443
        @hunk2b = @b[ 0 .. 0-1 ];
colas@0
  1444
    # or
colas@0
  1445
        @hunk2a = @a[ 0 .. 0 ];
colas@0
  1446
        @hunk2b = @b[ 0 .. -1 ];
colas@0
  1447
    # or
colas@0
  1448
        @hunk2a = ( 'a' );
colas@0
  1449
        @hunk2b = ( );
colas@0
  1450
colas@0
  1451
That is, we would delete item 0 ('a') from @a.
colas@0
  1452
colas@0
  1453
Since C<@diff[4..7]> contains (1,0,3,2) in our example, the third hunk
colas@0
  1454
consists of these two lists of items:
colas@0
  1455
colas@0
  1456
        @hunk3a = @a[ $cdiff[4] .. $cdiff[6]-1 ];
colas@0
  1457
        @hunk3a = @b[ $cdiff[5] .. $cdiff[7]-1 ];
colas@0
  1458
    # or
colas@0
  1459
        @hunk3a = @a[ 1 .. 3-1 ];
colas@0
  1460
        @hunk3a = @b[ 0 .. 2-1 ];
colas@0
  1461
    # or
colas@0
  1462
        @hunk3a = @a[ 1 .. 2 ];
colas@0
  1463
        @hunk3a = @b[ 0 .. 1 ];
colas@0
  1464
    # or
colas@0
  1465
        @hunk3a = qw( b c );
colas@0
  1466
        @hunk3a = qw( b c );
colas@0
  1467
colas@0
  1468
Note that this third hunk contains unchanged items as our convention demands.
colas@0
  1469
colas@0
  1470
You can continue this process until you reach the last two indices,
colas@0
  1471
which will always be the number of items in each sequence.  This is
colas@0
  1472
required so that subtracting one from each will give you the indices to
colas@0
  1473
the last items in each sequence.
colas@0
  1474
colas@0
  1475
=head2 C<traverse_sequences>
colas@0
  1476
colas@0
  1477
C<traverse_sequences> used to be the most general facility provided by
colas@0
  1478
this module (the new OO interface is more powerful and much easier to
colas@0
  1479
use).
colas@0
  1480
colas@0
  1481
Imagine that there are two arrows.  Arrow A points to an element of
colas@0
  1482
sequence A, and arrow B points to an element of the sequence B. 
colas@0
  1483
Initially, the arrows point to the first elements of the respective
colas@0
  1484
sequences.  C<traverse_sequences> will advance the arrows through the
colas@0
  1485
sequences one element at a time, calling an appropriate user-specified
colas@0
  1486
callback function before each advance.  It willadvance the arrows in
colas@0
  1487
such a way that if there are equal elements C<$A[$i]> and C<$B[$j]>
colas@0
  1488
which are equal and which are part of the LCS, there will be some moment
colas@0
  1489
during the execution of C<traverse_sequences> when arrow A is pointing
colas@0
  1490
to C<$A[$i]> and arrow B is pointing to C<$B[$j]>.  When this happens,
colas@0
  1491
C<traverse_sequences> will call the C<MATCH> callback function and then
colas@0
  1492
it will advance both arrows.
colas@0
  1493
colas@0
  1494
Otherwise, one of the arrows is pointing to an element of its sequence
colas@0
  1495
that is not part of the LCS.  C<traverse_sequences> will advance that
colas@0
  1496
arrow and will call the C<DISCARD_A> or the C<DISCARD_B> callback,
colas@0
  1497
depending on which arrow it advanced.  If both arrows point to elements
colas@0
  1498
that are not part of the LCS, then C<traverse_sequences> will advance
colas@0
  1499
one of them and call the appropriate callback, but it is not specified
colas@0
  1500
which it will call.
colas@0
  1501
colas@0
  1502
The arguments to C<traverse_sequences> are the two sequences to
colas@0
  1503
traverse, and a hash which specifies the callback functions, like this:
colas@0
  1504
colas@0
  1505
    traverse_sequences(
colas@0
  1506
        \@seq1, \@seq2,
colas@0
  1507
        {   MATCH => $callback_1,
colas@0
  1508
            DISCARD_A => $callback_2,
colas@0
  1509
            DISCARD_B => $callback_3,
colas@0
  1510
        }
colas@0
  1511
    );
colas@0
  1512
colas@0
  1513
Callbacks for MATCH, DISCARD_A, and DISCARD_B are invoked with at least
colas@0
  1514
the indices of the two arrows as their arguments.  They are not expected
colas@0
  1515
to return any values.  If a callback is omitted from the table, it is
colas@0
  1516
not called.
colas@0
  1517
colas@0
  1518
Callbacks for A_FINISHED and B_FINISHED are invoked with at least the
colas@0
  1519
corresponding index in A or B.
colas@0
  1520
colas@0
  1521
If arrow A reaches the end of its sequence, before arrow B does,
colas@0
  1522
C<traverse_sequences> will call the C<A_FINISHED> callback when it
colas@0
  1523
advances arrow B, if there is such a function; if not it will call
colas@0
  1524
C<DISCARD_B> instead.  Similarly if arrow B finishes first. 
colas@0
  1525
C<traverse_sequences> returns when both arrows are at the ends of their
colas@0
  1526
respective sequences.  It returns true on success and false on failure. 
colas@0
  1527
At present there is no way to fail.
colas@0
  1528
colas@0
  1529
C<traverse_sequences> may be passed an optional fourth parameter; this
colas@0
  1530
is a CODE reference to a key generation function.  See L</KEY GENERATION
colas@0
  1531
FUNCTIONS>.
colas@0
  1532
colas@0
  1533
Additional parameters, if any, will be passed to the key generation function.
colas@0
  1534
colas@0
  1535
If you want to pass additional parameters to your callbacks, but don't
colas@0
  1536
need a custom key generation function, you can get the default by
colas@0
  1537
passing undef:
colas@0
  1538
colas@0
  1539
    traverse_sequences(
colas@0
  1540
        \@seq1, \@seq2,
colas@0
  1541
        {   MATCH => $callback_1,
colas@0
  1542
            DISCARD_A => $callback_2,
colas@0
  1543
            DISCARD_B => $callback_3,
colas@0
  1544
        },
colas@0
  1545
        undef,     # default key-gen
colas@0
  1546
        $myArgument1,
colas@0
  1547
        $myArgument2,
colas@0
  1548
        $myArgument3,
colas@0
  1549
    );
colas@0
  1550
colas@0
  1551
C<traverse_sequences> does not have a useful return value; you are
colas@0
  1552
expected to plug in the appropriate behavior with the callback
colas@0
  1553
functions.
colas@0
  1554
colas@0
  1555
=head2 C<traverse_balanced>
colas@0
  1556
colas@0
  1557
C<traverse_balanced> is an alternative to C<traverse_sequences>. It
colas@0
  1558
uses a different algorithm to iterate through the entries in the
colas@0
  1559
computed LCS. Instead of sticking to one side and showing element changes
colas@0
  1560
as insertions and deletions only, it will jump back and forth between
colas@0
  1561
the two sequences and report I<changes> occurring as deletions on one
colas@0
  1562
side followed immediatly by an insertion on the other side.
colas@0
  1563
colas@0
  1564
In addition to the C<DISCARD_A>, C<DISCARD_B>, and C<MATCH> callbacks
colas@0
  1565
supported by C<traverse_sequences>, C<traverse_balanced> supports
colas@0
  1566
a C<CHANGE> callback indicating that one element got C<replaced> by another:
colas@0
  1567
colas@0
  1568
    traverse_balanced(
colas@0
  1569
        \@seq1, \@seq2,
colas@0
  1570
        {   MATCH => $callback_1,
colas@0
  1571
            DISCARD_A => $callback_2,
colas@0
  1572
            DISCARD_B => $callback_3,
colas@0
  1573
            CHANGE    => $callback_4,
colas@0
  1574
        }
colas@0
  1575
    );
colas@0
  1576
colas@0
  1577
If no C<CHANGE> callback is specified, C<traverse_balanced>
colas@0
  1578
will map C<CHANGE> events to C<DISCARD_A> and C<DISCARD_B> actions,
colas@0
  1579
therefore resulting in a similar behaviour as C<traverse_sequences>
colas@0
  1580
with different order of events.
colas@0
  1581
colas@0
  1582
C<traverse_balanced> might be a bit slower than C<traverse_sequences>,
colas@0
  1583
noticable only while processing huge amounts of data.
colas@0
  1584
colas@0
  1585
The C<sdiff> function of this module 
colas@0
  1586
is implemented as call to C<traverse_balanced>.
colas@0
  1587
colas@0
  1588
C<traverse_balanced> does not have a useful return value; you are expected to
colas@0
  1589
plug in the appropriate behavior with the callback functions.
colas@0
  1590
colas@0
  1591
=head1 KEY GENERATION FUNCTIONS
colas@0
  1592
colas@0
  1593
Most of the functions accept an optional extra parameter.  This is a
colas@0
  1594
CODE reference to a key generating (hashing) function that should return
colas@0
  1595
a string that uniquely identifies a given element.  It should be the
colas@0
  1596
case that if two elements are to be considered equal, their keys should
colas@0
  1597
be the same (and the other way around).  If no key generation function
colas@0
  1598
is provided, the key will be the element as a string.
colas@0
  1599
colas@0
  1600
By default, comparisons will use "eq" and elements will be turned into keys
colas@0
  1601
using the default stringizing operator '""'.
colas@0
  1602
colas@0
  1603
Where this is important is when you're comparing something other than
colas@0
  1604
strings.  If it is the case that you have multiple different objects
colas@0
  1605
that should be considered to be equal, you should supply a key
colas@0
  1606
generation function. Otherwise, you have to make sure that your arrays
colas@0
  1607
contain unique references.
colas@0
  1608
colas@0
  1609
For instance, consider this example:
colas@0
  1610
colas@0
  1611
    package Person;
colas@0
  1612
colas@0
  1613
    sub new
colas@0
  1614
    {
colas@0
  1615
        my $package = shift;
colas@0
  1616
        return bless { name => '', ssn => '', @_ }, $package;
colas@0
  1617
    }
colas@0
  1618
colas@0
  1619
    sub clone
colas@0
  1620
    {
colas@0
  1621
        my $old = shift;
colas@0
  1622
        my $new = bless { %$old }, ref($old);
colas@0
  1623
    }
colas@0
  1624
colas@0
  1625
    sub hash
colas@0
  1626
    {
colas@0
  1627
        return shift()->{'ssn'};
colas@0
  1628
    }
colas@0
  1629
colas@0
  1630
    my $person1 = Person->new( name => 'Joe', ssn => '123-45-6789' );
colas@0
  1631
    my $person2 = Person->new( name => 'Mary', ssn => '123-47-0000' );
colas@0
  1632
    my $person3 = Person->new( name => 'Pete', ssn => '999-45-2222' );
colas@0
  1633
    my $person4 = Person->new( name => 'Peggy', ssn => '123-45-9999' );
colas@0
  1634
    my $person5 = Person->new( name => 'Frank', ssn => '000-45-9999' );
colas@0
  1635
colas@0
  1636
If you did this:
colas@0
  1637
colas@0
  1638
    my $array1 = [ $person1, $person2, $person4 ];
colas@0
  1639
    my $array2 = [ $person1, $person3, $person4, $person5 ];
colas@0
  1640
    Algorithm::Diff::diff( $array1, $array2 );
colas@0
  1641
colas@0
  1642
everything would work out OK (each of the objects would be converted
colas@0
  1643
into a string like "Person=HASH(0x82425b0)" for comparison).
colas@0
  1644
colas@0
  1645
But if you did this:
colas@0
  1646
colas@0
  1647
    my $array1 = [ $person1, $person2, $person4 ];
colas@0
  1648
    my $array2 = [ $person1, $person3, $person4->clone(), $person5 ];
colas@0
  1649
    Algorithm::Diff::diff( $array1, $array2 );
colas@0
  1650
colas@0
  1651
$person4 and $person4->clone() (which have the same name and SSN)
colas@0
  1652
would be seen as different objects. If you wanted them to be considered
colas@0
  1653
equivalent, you would have to pass in a key generation function:
colas@0
  1654
colas@0
  1655
    my $array1 = [ $person1, $person2, $person4 ];
colas@0
  1656
    my $array2 = [ $person1, $person3, $person4->clone(), $person5 ];
colas@0
  1657
    Algorithm::Diff::diff( $array1, $array2, \&Person::hash );
colas@0
  1658
colas@0
  1659
This would use the 'ssn' field in each Person as a comparison key, and
colas@0
  1660
so would consider $person4 and $person4->clone() as equal.
colas@0
  1661
colas@0
  1662
You may also pass additional parameters to the key generation function
colas@0
  1663
if you wish.
colas@0
  1664
colas@0
  1665
=head1 ERROR CHECKING
colas@0
  1666
colas@0
  1667
If you pass these routines a non-reference and they expect a reference,
colas@0
  1668
they will die with a message.
colas@0
  1669
colas@0
  1670
=head1 AUTHOR
colas@0
  1671
colas@0
  1672
This version released by Tye McQueen (http://perlmonks.org/?node=tye).
colas@0
  1673
colas@0
  1674
=head1 LICENSE
colas@0
  1675
colas@0
  1676
Parts Copyright (c) 2000-2004 Ned Konz.  All rights reserved.
colas@0
  1677
Parts by Tye McQueen.
colas@0
  1678
colas@0
  1679
This program is free software; you can redistribute it and/or modify it
colas@0
  1680
under the same terms as Perl.
colas@0
  1681
colas@0
  1682
=head1 MAILING LIST
colas@0
  1683
colas@0
  1684
Mark-Jason still maintains a mailing list.  To join a low-volume mailing
colas@0
  1685
list for announcements related to diff and Algorithm::Diff, send an
colas@0
  1686
empty mail message to mjd-perl-diff-request@plover.com.
colas@0
  1687
colas@0
  1688
=head1 CREDITS
colas@0
  1689
colas@0
  1690
Versions through 0.59 (and much of this documentation) were written by:
colas@0
  1691
colas@0
  1692
Mark-Jason Dominus, mjd-perl-diff@plover.com
colas@0
  1693
colas@0
  1694
This version borrows some documentation and routine names from
colas@0
  1695
Mark-Jason's, but Diff.pm's code was completely replaced.
colas@0
  1696
colas@0
  1697
This code was adapted from the Smalltalk code of Mario Wolczko
colas@0
  1698
<mario@wolczko.com>, which is available at
colas@0
  1699
ftp://st.cs.uiuc.edu/pub/Smalltalk/MANCHESTER/manchester/4.0/diff.st
colas@0
  1700
colas@0
  1701
C<sdiff> and C<traverse_balanced> were written by Mike Schilli
colas@0
  1702
<m@perlmeister.com>.
colas@0
  1703
colas@0
  1704
The algorithm is that described in
colas@0
  1705
I<A Fast Algorithm for Computing Longest Common Subsequences>,
colas@0
  1706
CACM, vol.20, no.5, pp.350-353, May 1977, with a few
colas@0
  1707
minor improvements to improve the speed.
colas@0
  1708
colas@0
  1709
Much work was done by Ned Konz (perl@bike-nomad.com).
colas@0
  1710
colas@0
  1711
The OO interface and some other changes are by Tye McQueen.
colas@0
  1712
colas@0
  1713
=cut