lib/CPAN/lib/Algorithm/Diff.pm
author Colas Nahaboo <colas@nahaboo.net>
Mon, 11 Aug 2008 20:33:37 +0200
changeset 2 7bc60a767fa4
parent 0 414e01d06fd5
permissions -rw-r--r--
TWiki-4.2.2 Release
     1 package Algorithm::Diff;
     2 # Skip to first "=head" line for documentation.
     3 use strict;
     4 
     5 use integer;    # see below in _replaceNextLargerWith() for mod to make
     6                 # if you don't use this
     7 use vars qw( $VERSION @EXPORT_OK );
     8 $VERSION = 1.19_01;
     9 #          ^ ^^ ^^-- Incremented at will
    10 #          | \+----- Incremented for non-trivial changes to features
    11 #          \-------- Incremented for fundamental changes
    12 require Exporter;
    13 *import    = \&Exporter::import;
    14 @EXPORT_OK = qw(
    15     prepare LCS LCDidx LCS_length
    16     diff sdiff compact_diff
    17     traverse_sequences traverse_balanced
    18 );
    19 
    20 # McIlroy-Hunt diff algorithm
    21 # Adapted from the Smalltalk code of Mario I. Wolczko, <mario@wolczko.com>
    22 # by Ned Konz, perl@bike-nomad.com
    23 # Updates by Tye McQueen, http://perlmonks.org/?node=tye
    24 
    25 # Create a hash that maps each element of $aCollection to the set of
    26 # positions it occupies in $aCollection, restricted to the elements
    27 # within the range of indexes specified by $start and $end.
    28 # The fourth parameter is a subroutine reference that will be called to
    29 # generate a string to use as a key.
    30 # Additional parameters, if any, will be passed to this subroutine.
    31 #
    32 # my $hashRef = _withPositionsOfInInterval( \@array, $start, $end, $keyGen );
    33 
    34 sub _withPositionsOfInInterval
    35 {
    36     my $aCollection = shift;    # array ref
    37     my $start       = shift;
    38     my $end         = shift;
    39     my $keyGen      = shift;
    40     my %d;
    41     my $index;
    42     for ( $index = $start ; $index <= $end ; $index++ )
    43     {
    44         my $element = $aCollection->[$index];
    45         my $key = &$keyGen( $element, @_ );
    46         if ( exists( $d{$key} ) )
    47         {
    48             unshift ( @{ $d{$key} }, $index );
    49         }
    50         else
    51         {
    52             $d{$key} = [$index];
    53         }
    54     }
    55     return wantarray ? %d : \%d;
    56 }
    57 
    58 # Find the place at which aValue would normally be inserted into the
    59 # array. If that place is already occupied by aValue, do nothing, and
    60 # return undef. If the place does not exist (i.e., it is off the end of
    61 # the array), add it to the end, otherwise replace the element at that
    62 # point with aValue.  It is assumed that the array's values are numeric.
    63 # This is where the bulk (75%) of the time is spent in this module, so
    64 # try to make it fast!
    65 
    66 sub _replaceNextLargerWith
    67 {
    68     my ( $array, $aValue, $high ) = @_;
    69     $high ||= $#$array;
    70 
    71     # off the end?
    72     if ( $high == -1 || $aValue > $array->[-1] )
    73     {
    74         push ( @$array, $aValue );
    75         return $high + 1;
    76     }
    77 
    78     # binary search for insertion point...
    79     my $low = 0;
    80     my $index;
    81     my $found;
    82     while ( $low <= $high )
    83     {
    84         $index = ( $high + $low ) / 2;
    85 
    86         # $index = int(( $high + $low ) / 2);  # without 'use integer'
    87         $found = $array->[$index];
    88 
    89         if ( $aValue == $found )
    90         {
    91             return undef;
    92         }
    93         elsif ( $aValue > $found )
    94         {
    95             $low = $index + 1;
    96         }
    97         else
    98         {
    99             $high = $index - 1;
   100         }
   101     }
   102 
   103     # now insertion point is in $low.
   104     $array->[$low] = $aValue;    # overwrite next larger
   105     return $low;
   106 }
   107 
   108 # This method computes the longest common subsequence in $a and $b.
   109 
   110 # Result is array or ref, whose contents is such that
   111 #   $a->[ $i ] == $b->[ $result[ $i ] ]
   112 # foreach $i in ( 0 .. $#result ) if $result[ $i ] is defined.
   113 
   114 # An additional argument may be passed; this is a hash or key generating
   115 # function that should return a string that uniquely identifies the given
   116 # element.  It should be the case that if the key is the same, the elements
   117 # will compare the same. If this parameter is undef or missing, the key
   118 # will be the element as a string.
   119 
   120 # By default, comparisons will use "eq" and elements will be turned into keys
   121 # using the default stringizing operator '""'.
   122 
   123 # Additional parameters, if any, will be passed to the key generation
   124 # routine.
   125 
   126 sub _longestCommonSubsequence
   127 {
   128     my $a        = shift;    # array ref or hash ref
   129     my $b        = shift;    # array ref or hash ref
   130     my $counting = shift;    # scalar
   131     my $keyGen   = shift;    # code ref
   132     my $compare;             # code ref
   133 
   134     if ( ref($a) eq 'HASH' )
   135     {                        # prepared hash must be in $b
   136         my $tmp = $b;
   137         $b = $a;
   138         $a = $tmp;
   139     }
   140 
   141     # Check for bogus (non-ref) argument values
   142     if ( !ref($a) || !ref($b) )
   143     {
   144         my @callerInfo = caller(1);
   145         die 'error: must pass array or hash references to ' . $callerInfo[3];
   146     }
   147 
   148     # set up code refs
   149     # Note that these are optimized.
   150     if ( !defined($keyGen) )    # optimize for strings
   151     {
   152         $keyGen = sub { $_[0] };
   153         $compare = sub { my ( $a, $b ) = @_; $a eq $b };
   154     }
   155     else
   156     {
   157         $compare = sub {
   158             my $a = shift;
   159             my $b = shift;
   160             &$keyGen( $a, @_ ) eq &$keyGen( $b, @_ );
   161         };
   162     }
   163 
   164     my ( $aStart, $aFinish, $matchVector ) = ( 0, $#$a, [] );
   165     my ( $prunedCount, $bMatches ) = ( 0, {} );
   166 
   167     if ( ref($b) eq 'HASH' )    # was $bMatches prepared for us?
   168     {
   169         $bMatches = $b;
   170     }
   171     else
   172     {
   173         my ( $bStart, $bFinish ) = ( 0, $#$b );
   174 
   175         # First we prune off any common elements at the beginning
   176         while ( $aStart <= $aFinish
   177             and $bStart <= $bFinish
   178             and &$compare( $a->[$aStart], $b->[$bStart], @_ ) )
   179         {
   180             $matchVector->[ $aStart++ ] = $bStart++;
   181             $prunedCount++;
   182         }
   183 
   184         # now the end
   185         while ( $aStart <= $aFinish
   186             and $bStart <= $bFinish
   187             and &$compare( $a->[$aFinish], $b->[$bFinish], @_ ) )
   188         {
   189             $matchVector->[ $aFinish-- ] = $bFinish--;
   190             $prunedCount++;
   191         }
   192 
   193         # Now compute the equivalence classes of positions of elements
   194         $bMatches =
   195           _withPositionsOfInInterval( $b, $bStart, $bFinish, $keyGen, @_ );
   196     }
   197     my $thresh = [];
   198     my $links  = [];
   199 
   200     my ( $i, $ai, $j, $k );
   201     for ( $i = $aStart ; $i <= $aFinish ; $i++ )
   202     {
   203         $ai = &$keyGen( $a->[$i], @_ );
   204         if ( exists( $bMatches->{$ai} ) )
   205         {
   206             $k = 0;
   207             for $j ( @{ $bMatches->{$ai} } )
   208             {
   209 
   210                 # optimization: most of the time this will be true
   211                 if ( $k and $thresh->[$k] > $j and $thresh->[ $k - 1 ] < $j )
   212                 {
   213                     $thresh->[$k] = $j;
   214                 }
   215                 else
   216                 {
   217                     $k = _replaceNextLargerWith( $thresh, $j, $k );
   218                 }
   219 
   220                 # oddly, it's faster to always test this (CPU cache?).
   221                 if ( defined($k) )
   222                 {
   223                     $links->[$k] =
   224                       [ ( $k ? $links->[ $k - 1 ] : undef ), $i, $j ];
   225                 }
   226             }
   227         }
   228     }
   229 
   230     if (@$thresh)
   231     {
   232         return $prunedCount + @$thresh if $counting;
   233         for ( my $link = $links->[$#$thresh] ; $link ; $link = $link->[0] )
   234         {
   235             $matchVector->[ $link->[1] ] = $link->[2];
   236         }
   237     }
   238     elsif ($counting)
   239     {
   240         return $prunedCount;
   241     }
   242 
   243     return wantarray ? @$matchVector : $matchVector;
   244 }
   245 
   246 sub traverse_sequences
   247 {
   248     my $a                 = shift;          # array ref
   249     my $b                 = shift;          # array ref
   250     my $callbacks         = shift || {};
   251     my $keyGen            = shift;
   252     my $matchCallback     = $callbacks->{'MATCH'} || sub { };
   253     my $discardACallback  = $callbacks->{'DISCARD_A'} || sub { };
   254     my $finishedACallback = $callbacks->{'A_FINISHED'};
   255     my $discardBCallback  = $callbacks->{'DISCARD_B'} || sub { };
   256     my $finishedBCallback = $callbacks->{'B_FINISHED'};
   257     my $matchVector = _longestCommonSubsequence( $a, $b, 0, $keyGen, @_ );
   258 
   259     # Process all the lines in @$matchVector
   260     my $lastA = $#$a;
   261     my $lastB = $#$b;
   262     my $bi    = 0;
   263     my $ai;
   264 
   265     for ( $ai = 0 ; $ai <= $#$matchVector ; $ai++ )
   266     {
   267         my $bLine = $matchVector->[$ai];
   268         if ( defined($bLine) )    # matched
   269         {
   270             &$discardBCallback( $ai, $bi++, @_ ) while $bi < $bLine;
   271             &$matchCallback( $ai,    $bi++, @_ );
   272         }
   273         else
   274         {
   275             &$discardACallback( $ai, $bi, @_ );
   276         }
   277     }
   278 
   279     # The last entry (if any) processed was a match.
   280     # $ai and $bi point just past the last matching lines in their sequences.
   281 
   282     while ( $ai <= $lastA or $bi <= $lastB )
   283     {
   284 
   285         # last A?
   286         if ( $ai == $lastA + 1 and $bi <= $lastB )
   287         {
   288             if ( defined($finishedACallback) )
   289             {
   290                 &$finishedACallback( $lastA, @_ );
   291                 $finishedACallback = undef;
   292             }
   293             else
   294             {
   295                 &$discardBCallback( $ai, $bi++, @_ ) while $bi <= $lastB;
   296             }
   297         }
   298 
   299         # last B?
   300         if ( $bi == $lastB + 1 and $ai <= $lastA )
   301         {
   302             if ( defined($finishedBCallback) )
   303             {
   304                 &$finishedBCallback( $lastB, @_ );
   305                 $finishedBCallback = undef;
   306             }
   307             else
   308             {
   309                 &$discardACallback( $ai++, $bi, @_ ) while $ai <= $lastA;
   310             }
   311         }
   312 
   313         &$discardACallback( $ai++, $bi, @_ ) if $ai <= $lastA;
   314         &$discardBCallback( $ai, $bi++, @_ ) if $bi <= $lastB;
   315     }
   316 
   317     return 1;
   318 }
   319 
   320 sub traverse_balanced
   321 {
   322     my $a                 = shift;              # array ref
   323     my $b                 = shift;              # array ref
   324     my $callbacks         = shift || {};
   325     my $keyGen            = shift;
   326     my $matchCallback     = $callbacks->{'MATCH'} || sub { };
   327     my $discardACallback  = $callbacks->{'DISCARD_A'} || sub { };
   328     my $discardBCallback  = $callbacks->{'DISCARD_B'} || sub { };
   329     my $changeCallback    = $callbacks->{'CHANGE'};
   330     my $matchVector = _longestCommonSubsequence( $a, $b, 0, $keyGen, @_ );
   331 
   332     # Process all the lines in match vector
   333     my $lastA = $#$a;
   334     my $lastB = $#$b;
   335     my $bi    = 0;
   336     my $ai    = 0;
   337     my $ma    = -1;
   338     my $mb;
   339 
   340     while (1)
   341     {
   342 
   343         # Find next match indices $ma and $mb
   344         do {
   345             $ma++;
   346         } while(
   347                 $ma <= $#$matchVector
   348             &&  !defined $matchVector->[$ma]
   349         );
   350 
   351         last if $ma > $#$matchVector;    # end of matchVector?
   352         $mb = $matchVector->[$ma];
   353 
   354         # Proceed with discard a/b or change events until
   355         # next match
   356         while ( $ai < $ma || $bi < $mb )
   357         {
   358 
   359             if ( $ai < $ma && $bi < $mb )
   360             {
   361 
   362                 # Change
   363                 if ( defined $changeCallback )
   364                 {
   365                     &$changeCallback( $ai++, $bi++, @_ );
   366                 }
   367                 else
   368                 {
   369                     &$discardACallback( $ai++, $bi, @_ );
   370                     &$discardBCallback( $ai, $bi++, @_ );
   371                 }
   372             }
   373             elsif ( $ai < $ma )
   374             {
   375                 &$discardACallback( $ai++, $bi, @_ );
   376             }
   377             else
   378             {
   379 
   380                 # $bi < $mb
   381                 &$discardBCallback( $ai, $bi++, @_ );
   382             }
   383         }
   384 
   385         # Match
   386         &$matchCallback( $ai++, $bi++, @_ );
   387     }
   388 
   389     while ( $ai <= $lastA || $bi <= $lastB )
   390     {
   391         if ( $ai <= $lastA && $bi <= $lastB )
   392         {
   393 
   394             # Change
   395             if ( defined $changeCallback )
   396             {
   397                 &$changeCallback( $ai++, $bi++, @_ );
   398             }
   399             else
   400             {
   401                 &$discardACallback( $ai++, $bi, @_ );
   402                 &$discardBCallback( $ai, $bi++, @_ );
   403             }
   404         }
   405         elsif ( $ai <= $lastA )
   406         {
   407             &$discardACallback( $ai++, $bi, @_ );
   408         }
   409         else
   410         {
   411 
   412             # $bi <= $lastB
   413             &$discardBCallback( $ai, $bi++, @_ );
   414         }
   415     }
   416 
   417     return 1;
   418 }
   419 
   420 sub prepare
   421 {
   422     my $a       = shift;    # array ref
   423     my $keyGen  = shift;    # code ref
   424 
   425     # set up code ref
   426     $keyGen = sub { $_[0] } unless defined($keyGen);
   427 
   428     return scalar _withPositionsOfInInterval( $a, 0, $#$a, $keyGen, @_ );
   429 }
   430 
   431 sub LCS
   432 {
   433     my $a = shift;                  # array ref
   434     my $b = shift;                  # array ref or hash ref
   435     my $matchVector = _longestCommonSubsequence( $a, $b, 0, @_ );
   436     my @retval;
   437     my $i;
   438     for ( $i = 0 ; $i <= $#$matchVector ; $i++ )
   439     {
   440         if ( defined( $matchVector->[$i] ) )
   441         {
   442             push ( @retval, $a->[$i] );
   443         }
   444     }
   445     return wantarray ? @retval : \@retval;
   446 }
   447 
   448 sub LCS_length
   449 {
   450     my $a = shift;                          # array ref
   451     my $b = shift;                          # array ref or hash ref
   452     return _longestCommonSubsequence( $a, $b, 1, @_ );
   453 }
   454 
   455 sub LCSidx
   456 {
   457     my $a= shift @_;
   458     my $b= shift @_;
   459     my $match= _longestCommonSubsequence( $a, $b, 0, @_ );
   460     my @am= grep defined $match->[$_], 0..$#$match;
   461     my @bm= @{$match}[@am];
   462     return \@am, \@bm;
   463 }
   464 
   465 sub compact_diff
   466 {
   467     my $a= shift @_;
   468     my $b= shift @_;
   469     my( $am, $bm )= LCSidx( $a, $b, @_ );
   470     my @cdiff;
   471     my( $ai, $bi )= ( 0, 0 );
   472     push @cdiff, $ai, $bi;
   473     while( 1 ) {
   474         while(  @$am  &&  $ai == $am->[0]  &&  $bi == $bm->[0]  ) {
   475             shift @$am;
   476             shift @$bm;
   477             ++$ai, ++$bi;
   478         }
   479         push @cdiff, $ai, $bi;
   480         last   if  ! @$am;
   481         $ai = $am->[0];
   482         $bi = $bm->[0];
   483         push @cdiff, $ai, $bi;
   484     }
   485     push @cdiff, 0+@$a, 0+@$b
   486         if  $ai < @$a || $bi < @$b;
   487     return wantarray ? @cdiff : \@cdiff;
   488 }
   489 
   490 sub diff
   491 {
   492     my $a      = shift;    # array ref
   493     my $b      = shift;    # array ref
   494     my $retval = [];
   495     my $hunk   = [];
   496     my $discard = sub {
   497         push @$hunk, [ '-', $_[0], $a->[ $_[0] ] ];
   498     };
   499     my $add = sub {
   500         push @$hunk, [ '+', $_[1], $b->[ $_[1] ] ];
   501     };
   502     my $match = sub {
   503         push @$retval, $hunk
   504             if 0 < @$hunk;
   505         $hunk = []
   506     };
   507     traverse_sequences( $a, $b,
   508         { MATCH => $match, DISCARD_A => $discard, DISCARD_B => $add }, @_ );
   509     &$match();
   510     return wantarray ? @$retval : $retval;
   511 }
   512 
   513 sub sdiff
   514 {
   515     my $a      = shift;    # array ref
   516     my $b      = shift;    # array ref
   517     my $retval = [];
   518     my $discard = sub { push ( @$retval, [ '-', $a->[ $_[0] ], "" ] ) };
   519     my $add = sub { push ( @$retval, [ '+', "", $b->[ $_[1] ] ] ) };
   520     my $change = sub {
   521         push ( @$retval, [ 'c', $a->[ $_[0] ], $b->[ $_[1] ] ] );
   522     };
   523     my $match = sub {
   524         push ( @$retval, [ 'u', $a->[ $_[0] ], $b->[ $_[1] ] ] );
   525     };
   526     traverse_balanced(
   527         $a,
   528         $b,
   529         {
   530             MATCH     => $match,
   531             DISCARD_A => $discard,
   532             DISCARD_B => $add,
   533             CHANGE    => $change,
   534         },
   535         @_
   536     );
   537     return wantarray ? @$retval : $retval;
   538 }
   539 
   540 ########################################
   541 my $Root= __PACKAGE__;
   542 package Algorithm::Diff::_impl;
   543 use strict;
   544 
   545 sub _Idx()  { 0 } # $me->[_Idx]: Ref to array of hunk indices
   546             # 1   # $me->[1]: Ref to first sequence
   547             # 2   # $me->[2]: Ref to second sequence
   548 sub _End()  { 3 } # $me->[_End]: Diff between forward and reverse pos
   549 sub _Same() { 4 } # $me->[_Same]: 1 if pos 1 contains unchanged items
   550 sub _Base() { 5 } # $me->[_Base]: Added to range's min and max
   551 sub _Pos()  { 6 } # $me->[_Pos]: Which hunk is currently selected
   552 sub _Off()  { 7 } # $me->[_Off]: Offset into _Idx for current position
   553 sub _Min() { -2 } # Added to _Off to get min instead of max+1
   554 
   555 sub Die
   556 {
   557     require Carp;
   558     Carp::confess( @_ );
   559 }
   560 
   561 sub _ChkPos
   562 {
   563     my( $me )= @_;
   564     return   if  $me->[_Pos];
   565     my $meth= ( caller(1) )[3];
   566     Die( "Called $meth on 'reset' object" );
   567 }
   568 
   569 sub _ChkSeq
   570 {
   571     my( $me, $seq )= @_;
   572     return $seq + $me->[_Off]
   573         if  1 == $seq  ||  2 == $seq;
   574     my $meth= ( caller(1) )[3];
   575     Die( "$meth: Invalid sequence number ($seq); must be 1 or 2" );
   576 }
   577 
   578 sub getObjPkg
   579 {
   580     my( $us )= @_;
   581     return ref $us   if  ref $us;
   582     return $us . "::_obj";
   583 }
   584 
   585 sub new
   586 {
   587     my( $us, $seq1, $seq2, $opts ) = @_;
   588     my @args;
   589     for( $opts->{keyGen} ) {
   590         push @args, $_   if  $_;
   591     }
   592     for( $opts->{keyGenArgs} ) {
   593         push @args, @$_   if  $_;
   594     }
   595     my $cdif= Algorithm::Diff::compact_diff( $seq1, $seq2, @args );
   596     my $same= 1;
   597     if(  0 == $cdif->[2]  &&  0 == $cdif->[3]  ) {
   598         $same= 0;
   599         splice @$cdif, 0, 2;
   600     }
   601     my @obj= ( $cdif, $seq1, $seq2 );
   602     $obj[_End] = (1+@$cdif)/2;
   603     $obj[_Same] = $same;
   604     $obj[_Base] = 0;
   605     my $me = bless \@obj, $us->getObjPkg();
   606     $me->Reset( 0 );
   607     return $me;
   608 }
   609 
   610 sub Reset
   611 {
   612     my( $me, $pos )= @_;
   613     $pos= int( $pos || 0 );
   614     $pos += $me->[_End]
   615         if  $pos < 0;
   616     $pos= 0
   617         if  $pos < 0  ||  $me->[_End] <= $pos;
   618     $me->[_Pos]= $pos || !1;
   619     $me->[_Off]= 2*$pos - 1;
   620     return $me;
   621 }
   622 
   623 sub Base
   624 {
   625     my( $me, $base )= @_;
   626     my $oldBase= $me->[_Base];
   627     $me->[_Base]= 0+$base   if  defined $base;
   628     return $oldBase;
   629 }
   630 
   631 sub Copy
   632 {
   633     my( $me, $pos, $base )= @_;
   634     my @obj= @$me;
   635     my $you= bless \@obj, ref($me);
   636     $you->Reset( $pos )   if  defined $pos;
   637     $you->Base( $base );
   638     return $you;
   639 }
   640 
   641 sub Next {
   642     my( $me, $steps )= @_;
   643     $steps= 1   if  ! defined $steps;
   644     if( $steps ) {
   645         my $pos= $me->[_Pos];
   646         my $new= $pos + $steps;
   647         $new= 0   if  $pos  &&  $new < 0;
   648         $me->Reset( $new )
   649     }
   650     return $me->[_Pos];
   651 }
   652 
   653 sub Prev {
   654     my( $me, $steps )= @_;
   655     $steps= 1   if  ! defined $steps;
   656     my $pos= $me->Next(-$steps);
   657     $pos -= $me->[_End]   if  $pos;
   658     return $pos;
   659 }
   660 
   661 sub Diff {
   662     my( $me )= @_;
   663     $me->_ChkPos();
   664     return 0   if  $me->[_Same] == ( 1 & $me->[_Pos] );
   665     my $ret= 0;
   666     my $off= $me->[_Off];
   667     for my $seq ( 1, 2 ) {
   668         $ret |= $seq
   669             if  $me->[_Idx][ $off + $seq + _Min ]
   670             <   $me->[_Idx][ $off + $seq ];
   671     }
   672     return $ret;
   673 }
   674 
   675 sub Min {
   676     my( $me, $seq, $base )= @_;
   677     $me->_ChkPos();
   678     my $off= $me->_ChkSeq($seq);
   679     $base= $me->[_Base] if !defined $base;
   680     return $base + $me->[_Idx][ $off + _Min ];
   681 }
   682 
   683 sub Max {
   684     my( $me, $seq, $base )= @_;
   685     $me->_ChkPos();
   686     my $off= $me->_ChkSeq($seq);
   687     $base= $me->[_Base] if !defined $base;
   688     return $base + $me->[_Idx][ $off ] -1;
   689 }
   690 
   691 sub Range {
   692     my( $me, $seq, $base )= @_;
   693     $me->_ChkPos();
   694     my $off = $me->_ChkSeq($seq);
   695     if( !wantarray ) {
   696         return  $me->[_Idx][ $off ]
   697             -   $me->[_Idx][ $off + _Min ];
   698     }
   699     $base= $me->[_Base] if !defined $base;
   700     return  ( $base + $me->[_Idx][ $off + _Min ] )
   701         ..  ( $base + $me->[_Idx][ $off ] - 1 );
   702 }
   703 
   704 sub Items {
   705     my( $me, $seq )= @_;
   706     $me->_ChkPos();
   707     my $off = $me->_ChkSeq($seq);
   708     if( !wantarray ) {
   709         return  $me->[_Idx][ $off ]
   710             -   $me->[_Idx][ $off + _Min ];
   711     }
   712     return
   713         @{$me->[$seq]}[
   714                 $me->[_Idx][ $off + _Min ]
   715             ..  ( $me->[_Idx][ $off ] - 1 )
   716         ];
   717 }
   718 
   719 sub Same {
   720     my( $me )= @_;
   721     $me->_ChkPos();
   722     return wantarray ? () : 0
   723         if  $me->[_Same] != ( 1 & $me->[_Pos] );
   724     return $me->Items(1);
   725 }
   726 
   727 my %getName;
   728 BEGIN {
   729     %getName= (
   730         same => \&Same,
   731         diff => \&Diff,
   732         base => \&Base,
   733         min  => \&Min,
   734         max  => \&Max,
   735         range=> \&Range,
   736         items=> \&Items, # same thing
   737     );
   738 }
   739 
   740 sub Get
   741 {
   742     my $me= shift @_;
   743     $me->_ChkPos();
   744     my @value;
   745     for my $arg (  @_  ) {
   746         for my $word (  split ' ', $arg  ) {
   747             my $meth;
   748             if(     $word !~ /^(-?\d+)?([a-zA-Z]+)([12])?$/
   749                 ||  not  $meth= $getName{ lc $2 }
   750             ) {
   751                 Die( $Root, ", Get: Invalid request ($word)" );
   752             }
   753             my( $base, $name, $seq )= ( $1, $2, $3 );
   754             push @value, scalar(
   755                 4 == length($name)
   756                     ? $meth->( $me )
   757                     : $meth->( $me, $seq, $base )
   758             );
   759         }
   760     }
   761     if(  wantarray  ) {
   762         return @value;
   763     } elsif(  1 == @value  ) {
   764         return $value[0];
   765     }
   766     Die( 0+@value, " values requested from ",
   767         $Root, "'s Get in scalar context" );
   768 }
   769 
   770 
   771 my $Obj= getObjPkg($Root);
   772 no strict 'refs';
   773 
   774 for my $meth (  qw( new getObjPkg )  ) {
   775     *{$Root."::".$meth} = \&{$meth};
   776     *{$Obj ."::".$meth} = \&{$meth};
   777 }
   778 for my $meth (  qw(
   779     Next Prev Reset Copy Base Diff
   780     Same Items Range Min Max Get
   781     _ChkPos _ChkSeq
   782 )  ) {
   783     *{$Obj."::".$meth} = \&{$meth};
   784 }
   785 
   786 1;
   787 __END__
   788 
   789 =head1 NAME
   790 
   791 Algorithm::Diff - Compute `intelligent' differences between two files / lists
   792 
   793 =head1 SYNOPSIS
   794 
   795     require Algorithm::Diff;
   796 
   797     # This example produces traditional 'diff' output:
   798 
   799     my $diff = Algorithm::Diff->new( \@seq1, \@seq2 );
   800 
   801     $diff->Base( 1 );   # Return line numbers, not indices
   802     while(  $diff->Next()  ) {
   803         next   if  $diff->Same();
   804         my $sep = '';
   805         if(  ! $diff->Items(2)  ) {
   806             sprintf "%d,%dd%d\n",
   807                 $diff->Get(qw( Min1 Max1 Max2 ));
   808         } elsif(  ! $diff->Items(1)  ) {
   809             sprint "%da%d,%d\n",
   810                 $diff->Get(qw( Max1 Min2 Max2 ));
   811         } else {
   812             $sep = "---\n";
   813             sprintf "%d,%dc%d,%d\n",
   814                 $diff->Get(qw( Min1 Max1 Min2 Max2 ));
   815         }
   816         print "< $_"   for  $diff->Items(1);
   817         print $sep;
   818         print "> $_"   for  $diff->Items(2);
   819     }
   820 
   821 
   822     # Alternate interfaces:
   823 
   824     use Algorithm::Diff qw(
   825         LCS LCS_length LCSidx
   826         diff sdiff compact_diff
   827         traverse_sequences traverse_balanced );
   828 
   829     @lcs    = LCS( \@seq1, \@seq2 );
   830     $lcsref = LCS( \@seq1, \@seq2 );
   831     $count  = LCS_length( \@seq1, \@seq2 );
   832 
   833     ( $seq1idxref, $seq2idxref ) = LCSidx( \@seq1, \@seq2 );
   834 
   835 
   836     # Complicated interfaces:
   837 
   838     @diffs  = diff( \@seq1, \@seq2 );
   839 
   840     @sdiffs = sdiff( \@seq1, \@seq2 );
   841 
   842     @cdiffs = compact_diff( \@seq1, \@seq2 );
   843 
   844     traverse_sequences(
   845         \@seq1,
   846         \@seq2,
   847         {   MATCH     => \&callback1,
   848             DISCARD_A => \&callback2,
   849             DISCARD_B => \&callback3,
   850         },
   851         \&key_generator,
   852         @extra_args,
   853     );
   854 
   855     traverse_balanced(
   856         \@seq1,
   857         \@seq2,
   858         {   MATCH     => \&callback1,
   859             DISCARD_A => \&callback2,
   860             DISCARD_B => \&callback3,
   861             CHANGE    => \&callback4,
   862         },
   863         \&key_generator,
   864         @extra_args,
   865     );
   866 
   867 
   868 =head1 INTRODUCTION
   869 
   870 (by Mark-Jason Dominus)
   871 
   872 I once read an article written by the authors of C<diff>; they said
   873 that they worked very hard on the algorithm until they found the
   874 right one.
   875 
   876 I think what they ended up using (and I hope someone will correct me,
   877 because I am not very confident about this) was the `longest common
   878 subsequence' method.  In the LCS problem, you have two sequences of
   879 items:
   880 
   881     a b c d f g h j q z
   882 
   883     a b c d e f g i j k r x y z
   884 
   885 and you want to find the longest sequence of items that is present in
   886 both original sequences in the same order.  That is, you want to find
   887 a new sequence I<S> which can be obtained from the first sequence by
   888 deleting some items, and from the secend sequence by deleting other
   889 items.  You also want I<S> to be as long as possible.  In this case I<S>
   890 is
   891 
   892     a b c d f g j z
   893 
   894 From there it's only a small step to get diff-like output:
   895 
   896     e   h i   k   q r x y
   897     +   - +   +   - + + +
   898 
   899 This module solves the LCS problem.  It also includes a canned function
   900 to generate C<diff>-like output.
   901 
   902 It might seem from the example above that the LCS of two sequences is
   903 always pretty obvious, but that's not always the case, especially when
   904 the two sequences have many repeated elements.  For example, consider
   905 
   906     a x b y c z p d q
   907     a b c a x b y c z
   908 
   909 A naive approach might start by matching up the C<a> and C<b> that
   910 appear at the beginning of each sequence, like this:
   911 
   912     a x b y c         z p d q
   913     a   b   c a b y c z
   914 
   915 This finds the common subsequence C<a b c z>.  But actually, the LCS
   916 is C<a x b y c z>:
   917 
   918           a x b y c z p d q
   919     a b c a x b y c z
   920 
   921 or
   922 
   923     a       x b y c z p d q
   924     a b c a x b y c z
   925 
   926 =head1 USAGE
   927 
   928 (See also the README file and several example
   929 scripts include with this module.)
   930 
   931 This module now provides an object-oriented interface that uses less
   932 memory and is easier to use than most of the previous procedural
   933 interfaces.  It also still provides several exportable functions.  We'll
   934 deal with these in ascending order of difficulty:  C<LCS>,
   935 C<LCS_length>, C<LCSidx>, OO interface, C<prepare>, C<diff>, C<sdiff>,
   936 C<traverse_sequences>, and C<traverse_balanced>.
   937 
   938 =head2 C<LCS>
   939 
   940 Given references to two lists of items, LCS returns an array containing
   941 their longest common subsequence.  In scalar context, it returns a
   942 reference to such a list.
   943 
   944     @lcs    = LCS( \@seq1, \@seq2 );
   945     $lcsref = LCS( \@seq1, \@seq2 );
   946 
   947 C<LCS> may be passed an optional third parameter; this is a CODE
   948 reference to a key generation function.  See L</KEY GENERATION
   949 FUNCTIONS>.
   950 
   951     @lcs    = LCS( \@seq1, \@seq2, \&keyGen, @args );
   952     $lcsref = LCS( \@seq1, \@seq2, \&keyGen, @args );
   953 
   954 Additional parameters, if any, will be passed to the key generation
   955 routine.
   956 
   957 =head2 C<LCS_length>
   958 
   959 This is just like C<LCS> except it only returns the length of the
   960 longest common subsequence.  This provides a performance gain of about
   961 9% compared to C<LCS>.
   962 
   963 =head2 C<LCSidx>
   964 
   965 Like C<LCS> except it returns references to two arrays.  The first array
   966 contains the indices into @seq1 where the LCS items are located.  The
   967 second array contains the indices into @seq2 where the LCS items are located.
   968 
   969 Therefore, the following three lists will contain the same values:
   970 
   971     my( $idx1, $idx2 ) = LCSidx( \@seq1, \@seq2 );
   972     my @list1 = @seq1[ @$idx1 ];
   973     my @list2 = @seq2[ @$idx2 ];
   974     my @list3 = LCS( \@seq1, \@seq2 );
   975 
   976 =head2 C<new>
   977 
   978     $diff = Algorithm::Diffs->new( \@seq1, \@seq2 );
   979     $diff = Algorithm::Diffs->new( \@seq1, \@seq2, \%opts );
   980 
   981 C<new> computes the smallest set of additions and deletions necessary
   982 to turn the first sequence into the second and compactly records them
   983 in the object.
   984 
   985 You use the object to iterate over I<hunks>, where each hunk represents
   986 a contiguous section of items which should be added, deleted, replaced,
   987 or left unchanged.
   988 
   989 =over 4
   990 
   991 The following summary of all of the methods looks a lot like Perl code
   992 but some of the symbols have different meanings:
   993 
   994     [ ]     Encloses optional arguments
   995     :       Is followed by the default value for an optional argument
   996     |       Separates alternate return results
   997 
   998 Method summary:
   999 
  1000     $obj        = Algorithm::Diff->new( \@seq1, \@seq2, [ \%opts ] );
  1001     $pos        = $obj->Next(  [ $count : 1 ] );
  1002     $revPos     = $obj->Prev(  [ $count : 1 ] );
  1003     $obj        = $obj->Reset( [ $pos : 0 ] );
  1004     $copy       = $obj->Copy(  [ $pos, [ $newBase ] ] );
  1005     $oldBase    = $obj->Base(  [ $newBase ] );
  1006 
  1007 Note that all of the following methods C<die> if used on an object that
  1008 is "reset" (not currently pointing at any hunk).
  1009 
  1010     $bits       = $obj->Diff(  );
  1011     @items|$cnt = $obj->Same(  );
  1012     @items|$cnt = $obj->Items( $seqNum );
  1013     @idxs |$cnt = $obj->Range( $seqNum, [ $base ] );
  1014     $minIdx     = $obj->Min(   $seqNum, [ $base ] );
  1015     $maxIdx     = $obj->Max(   $seqNum, [ $base ] );
  1016     @values     = $obj->Get(   @names );
  1017 
  1018 Passing in C<undef> for an optional argument is always treated the same
  1019 as if no argument were passed in.
  1020 
  1021 =item C<Next>
  1022 
  1023     $pos = $diff->Next();    # Move forward 1 hunk
  1024     $pos = $diff->Next( 2 ); # Move forward 2 hunks
  1025     $pos = $diff->Next(-5);  # Move backward 5 hunks
  1026 
  1027 C<Next> moves the object to point at the next hunk.  The object starts
  1028 out "reset", which means it isn't pointing at any hunk.  If the object
  1029 is reset, then C<Next()> moves to the first hunk.
  1030 
  1031 C<Next> returns a true value iff the move didn't go past the last hunk.
  1032 So C<Next(0)> will return true iff the object is not reset.
  1033 
  1034 Actually, C<Next> returns the object's new position, which is a number
  1035 between 1 and the number of hunks (inclusive), or returns a false value.
  1036 
  1037 =item C<Prev>
  1038 
  1039 C<Prev($N)> is almost identical to C<Next(-$N)>; it moves to the $Nth
  1040 previous hunk.  On a 'reset' object, C<Prev()> [and C<Next(-1)>] move
  1041 to the last hunk.
  1042 
  1043 The position returned by C<Prev> is relative to the I<end> of the
  1044 hunks; -1 for the last hunk, -2 for the second-to-last, etc.
  1045 
  1046 =item C<Reset>
  1047 
  1048     $diff->Reset();     # Reset the object's position
  1049     $diff->Reset($pos); # Move to the specified hunk
  1050     $diff->Reset(1);    # Move to the first hunk
  1051     $diff->Reset(-1);   # Move to the last hunk
  1052 
  1053 C<Reset> returns the object, so, for example, you could use
  1054 C<< $diff->Reset()->Next(-1) >> to get the number of hunks.
  1055 
  1056 =item C<Copy>
  1057 
  1058     $copy = $diff->Copy( $newPos, $newBase );
  1059 
  1060 C<Copy> returns a copy of the object.  The copy and the orignal object
  1061 share most of their data, so making copies takes very little memory.
  1062 The copy maintains its own position (separate from the original), which
  1063 is the main purpose of copies.  It also maintains its own base.
  1064 
  1065 By default, the copy's position starts out the same as the original
  1066 object's position.  But C<Copy> takes an optional first argument to set the
  1067 new position, so the following three snippets are equivalent:
  1068 
  1069     $copy = $diff->Copy($pos);
  1070 
  1071     $copy = $diff->Copy();
  1072     $copy->Reset($pos);
  1073 
  1074     $copy = $diff->Copy()->Reset($pos);
  1075 
  1076 C<Copy> takes an optional second argument to set the base for
  1077 the copy.  If you wish to change the base of the copy but leave
  1078 the position the same as in the original, here are two
  1079 equivalent ways:
  1080 
  1081     $copy = $diff->Copy();
  1082     $copy->Base( 0 );
  1083 
  1084     $copy = $diff->Copy(undef,0);
  1085 
  1086 Here are two equivalent way to get a "reset" copy:
  1087 
  1088     $copy = $diff->Copy(0);
  1089 
  1090     $copy = $diff->Copy()->Reset();
  1091 
  1092 =item C<Diff>
  1093 
  1094     $bits = $obj->Diff();
  1095 
  1096 C<Diff> returns a true value iff the current hunk contains items that are
  1097 different between the two sequences.  It actually returns one of the
  1098 follow 4 values:
  1099 
  1100 =over 4
  1101 
  1102 =item 3
  1103 
  1104 C<3==(1|2)>.  This hunk contains items from @seq1 and the items
  1105 from @seq2 that should replace them.  Both sequence 1 and 2
  1106 contain changed items so both the 1 and 2 bits are set.
  1107 
  1108 =item 2
  1109 
  1110 This hunk only contains items from @seq2 that should be inserted (not
  1111 items from @seq1).  Only sequence 2 contains changed items so only the 2
  1112 bit is set.
  1113 
  1114 =item 1
  1115 
  1116 This hunk only contains items from @seq1 that should be deleted (not
  1117 items from @seq2).  Only sequence 1 contains changed items so only the 1
  1118 bit is set.
  1119 
  1120 =item 0
  1121 
  1122 This means that the items in this hunk are the same in both sequences.
  1123 Neither sequence 1 nor 2 contain changed items so neither the 1 nor the
  1124 2 bits are set.
  1125 
  1126 =back
  1127 
  1128 =item C<Same>
  1129 
  1130 C<Same> returns a true value iff the current hunk contains items that
  1131 are the same in both sequences.  It actually returns the list of items
  1132 if they are the same or an emty list if they aren't.  In a scalar
  1133 context, it returns the size of the list.
  1134 
  1135 =item C<Items>
  1136 
  1137     $count = $diff->Items(2);
  1138     @items = $diff->Items($seqNum);
  1139 
  1140 C<Items> returns the (number of) items from the specified sequence that
  1141 are part of the current hunk.
  1142 
  1143 If the current hunk contains only insertions, then
  1144 C<< $diff->Items(1) >> will return an empty list (0 in a scalar conext).
  1145 If the current hunk contains only deletions, then C<< $diff->Items(2) >>
  1146 will return an empty list (0 in a scalar conext).
  1147 
  1148 If the hunk contains replacements, then both C<< $diff->Items(1) >> and
  1149 C<< $diff->Items(2) >> will return different, non-empty lists.
  1150 
  1151 Otherwise, the hunk contains identical items and all of the following
  1152 will return the same lists:
  1153 
  1154     @items = $diff->Items(1);
  1155     @items = $diff->Items(2);
  1156     @items = $diff->Same();
  1157 
  1158 =item C<Range>
  1159 
  1160     $count = $diff->Range( $seqNum );
  1161     @indices = $diff->Range( $seqNum );
  1162     @indices = $diff->Range( $seqNum, $base );
  1163 
  1164 C<Range> is like C<Items> except that it returns a list of I<indices> to
  1165 the items rather than the items themselves.  By default, the index of
  1166 the first item (in each sequence) is 0 but this can be changed by
  1167 calling the C<Base> method.  So, by default, the following two snippets
  1168 return the same lists:
  1169 
  1170     @list = $diff->Items(2);
  1171     @list = @seq2[ $diff->Range(2) ];
  1172 
  1173 You can also specify the base to use as the second argument.  So the
  1174 following two snippets I<always> return the same lists:
  1175 
  1176     @list = $diff->Items(1);
  1177     @list = @seq1[ $diff->Range(1,0) ];
  1178 
  1179 =item C<Base>
  1180 
  1181     $curBase = $diff->Base();
  1182     $oldBase = $diff->Base($newBase);
  1183 
  1184 C<Base> sets and/or returns the current base (usually 0 or 1) that is
  1185 used when you request range information.  The base defaults to 0 so
  1186 that range information is returned as array indices.  You can set the
  1187 base to 1 if you want to report traditional line numbers instead.
  1188 
  1189 =item C<Min>
  1190 
  1191     $min1 = $diff->Min(1);
  1192     $min = $diff->Min( $seqNum, $base );
  1193 
  1194 C<Min> returns the first value that C<Range> would return (given the
  1195 same arguments) or returns C<undef> if C<Range> would return an empty
  1196 list.
  1197 
  1198 =item C<Max>
  1199 
  1200 C<Max> returns the last value that C<Range> would return or C<undef>.
  1201 
  1202 =item C<Get>
  1203 
  1204     ( $n, $x, $r ) = $diff->Get(qw( min1 max1 range1 ));
  1205     @values = $diff->Get(qw( 0min2 1max2 range2 same base ));
  1206 
  1207 C<Get> returns one or more scalar values.  You pass in a list of the
  1208 names of the values you want returned.  Each name must match one of the
  1209 following regexes:
  1210 
  1211     /^(-?\d+)?(min|max)[12]$/i
  1212     /^(range[12]|same|diff|base)$/i
  1213 
  1214 The 1 or 2 after a name says which sequence you want the information
  1215 for (and where allowed, it is required).  The optional number before
  1216 "min" or "max" is the base to use.  So the following equalities hold:
  1217 
  1218     $diff->Get('min1') == $diff->Min(1)
  1219     $diff->Get('0min2') == $diff->Min(2,0)
  1220 
  1221 Using C<Get> in a scalar context when you've passed in more than one
  1222 name is a fatal error (C<die> is called).
  1223 
  1224 =back
  1225 
  1226 =head2 C<prepare>
  1227 
  1228 Given a reference to a list of items, C<prepare> returns a reference
  1229 to a hash which can be used when comparing this sequence to other
  1230 sequences with C<LCS> or C<LCS_length>.
  1231 
  1232     $prep = prepare( \@seq1 );
  1233     for $i ( 0 .. 10_000 )
  1234     {
  1235         @lcs = LCS( $prep, $seq[$i] );
  1236         # do something useful with @lcs
  1237     }
  1238 
  1239 C<prepare> may be passed an optional third parameter; this is a CODE
  1240 reference to a key generation function.  See L</KEY GENERATION
  1241 FUNCTIONS>.
  1242 
  1243     $prep = prepare( \@seq1, \&keyGen );
  1244     for $i ( 0 .. 10_000 )
  1245     {
  1246         @lcs = LCS( $seq[$i], $prep, \&keyGen );
  1247         # do something useful with @lcs
  1248     }
  1249 
  1250 Using C<prepare> provides a performance gain of about 50% when calling LCS
  1251 many times compared with not preparing.
  1252 
  1253 =head2 C<diff>
  1254 
  1255     @diffs     = diff( \@seq1, \@seq2 );
  1256     $diffs_ref = diff( \@seq1, \@seq2 );
  1257 
  1258 C<diff> computes the smallest set of additions and deletions necessary
  1259 to turn the first sequence into the second, and returns a description
  1260 of these changes.  The description is a list of I<hunks>; each hunk
  1261 represents a contiguous section of items which should be added,
  1262 deleted, or replaced.  (Hunks containing unchanged items are not
  1263 included.)
  1264 
  1265 The return value of C<diff> is a list of hunks, or, in scalar context, a
  1266 reference to such a list.  If there are no differences, the list will be
  1267 empty.
  1268 
  1269 Here is an example.  Calling C<diff> for the following two sequences:
  1270 
  1271     a b c e h j l m n p
  1272     b c d e f j k l m r s t
  1273 
  1274 would produce the following list:
  1275 
  1276     (
  1277       [ [ '-', 0, 'a' ] ],
  1278 
  1279       [ [ '+', 2, 'd' ] ],
  1280 
  1281       [ [ '-', 4, 'h' ],
  1282         [ '+', 4, 'f' ] ],
  1283 
  1284       [ [ '+', 6, 'k' ] ],
  1285 
  1286       [ [ '-',  8, 'n' ],
  1287         [ '-',  9, 'p' ],
  1288         [ '+',  9, 'r' ],
  1289         [ '+', 10, 's' ],
  1290         [ '+', 11, 't' ] ],
  1291     )
  1292 
  1293 There are five hunks here.  The first hunk says that the C<a> at
  1294 position 0 of the first sequence should be deleted (C<->).  The second
  1295 hunk says that the C<d> at position 2 of the second sequence should
  1296 be inserted (C<+>).  The third hunk says that the C<h> at position 4
  1297 of the first sequence should be removed and replaced with the C<f>
  1298 from position 4 of the second sequence.  And so on.
  1299 
  1300 C<diff> may be passed an optional third parameter; this is a CODE
  1301 reference to a key generation function.  See L</KEY GENERATION
  1302 FUNCTIONS>.
  1303 
  1304 Additional parameters, if any, will be passed to the key generation
  1305 routine.
  1306 
  1307 =head2 C<sdiff>
  1308 
  1309     @sdiffs     = sdiff( \@seq1, \@seq2 );
  1310     $sdiffs_ref = sdiff( \@seq1, \@seq2 );
  1311 
  1312 C<sdiff> computes all necessary components to show two sequences
  1313 and their minimized differences side by side, just like the
  1314 Unix-utility I<sdiff> does:
  1315 
  1316     same             same
  1317     before     |     after
  1318     old        <     -
  1319     -          >     new
  1320 
  1321 It returns a list of array refs, each pointing to an array of
  1322 display instructions. In scalar context it returns a reference
  1323 to such a list. If there are no differences, the list will have one
  1324 entry per item, each indicating that the item was unchanged.
  1325 
  1326 Display instructions consist of three elements: A modifier indicator
  1327 (C<+>: Element added, C<->: Element removed, C<u>: Element unmodified,
  1328 C<c>: Element changed) and the value of the old and new elements, to
  1329 be displayed side-by-side.
  1330 
  1331 An C<sdiff> of the following two sequences:
  1332 
  1333     a b c e h j l m n p
  1334     b c d e f j k l m r s t
  1335 
  1336 results in
  1337 
  1338     ( [ '-', 'a', ''  ],
  1339       [ 'u', 'b', 'b' ],
  1340       [ 'u', 'c', 'c' ],
  1341       [ '+', '',  'd' ],
  1342       [ 'u', 'e', 'e' ],
  1343       [ 'c', 'h', 'f' ],
  1344       [ 'u', 'j', 'j' ],
  1345       [ '+', '',  'k' ],
  1346       [ 'u', 'l', 'l' ],
  1347       [ 'u', 'm', 'm' ],
  1348       [ 'c', 'n', 'r' ],
  1349       [ 'c', 'p', 's' ],
  1350       [ '+', '',  't' ],
  1351     )
  1352 
  1353 C<sdiff> may be passed an optional third parameter; this is a CODE
  1354 reference to a key generation function.  See L</KEY GENERATION
  1355 FUNCTIONS>.
  1356 
  1357 Additional parameters, if any, will be passed to the key generation
  1358 routine.
  1359 
  1360 =head2 C<compact_diff>
  1361 
  1362 C<compact_diff> is much like C<sdiff> except it returns a much more
  1363 compact description consisting of just one flat list of indices.  An
  1364 example helps explain the format:
  1365 
  1366     my @a = qw( a b c   e  h j   l m n p      );
  1367     my @b = qw(   b c d e f  j k l m    r s t );
  1368     @cdiff = compact_diff( \@a, \@b );
  1369     # Returns:
  1370     #   @a      @b       @a       @b
  1371     #  start   start   values   values
  1372     (    0,      0,   #       =
  1373          0,      0,   #    a  !
  1374          1,      0,   #  b c  =  b c
  1375          3,      2,   #       !  d
  1376          3,      3,   #    e  =  e
  1377          4,      4,   #    f  !  h
  1378          5,      5,   #    j  =  j
  1379          6,      6,   #       !  k
  1380          6,      7,   #  l m  =  l m
  1381          8,      9,   #  n p  !  r s t
  1382         10,     12,   #
  1383     );
  1384 
  1385 The 0th, 2nd, 4th, etc. entries are all indices into @seq1 (@a in the
  1386 above example) indicating where a hunk begins.  The 1st, 3rd, 5th, etc.
  1387 entries are all indices into @seq2 (@b in the above example) indicating
  1388 where the same hunk begins.
  1389 
  1390 So each pair of indices (except the last pair) describes where a hunk
  1391 begins (in each sequence).  Since each hunk must end at the item just
  1392 before the item that starts the next hunk, the next pair of indices can
  1393 be used to determine where the hunk ends.
  1394 
  1395 So, the first 4 entries (0..3) describe the first hunk.  Entries 0 and 1
  1396 describe where the first hunk begins (and so are always both 0).
  1397 Entries 2 and 3 describe where the next hunk begins, so subtracting 1
  1398 from each tells us where the first hunk ends.  That is, the first hunk
  1399 contains items C<$diff[0]> through C<$diff[2] - 1> of the first sequence
  1400 and contains items C<$diff[1]> through C<$diff[3] - 1> of the second
  1401 sequence.
  1402 
  1403 In other words, the first hunk consists of the following two lists of items:
  1404 
  1405                #  1st pair     2nd pair
  1406                # of indices   of indices
  1407     @list1 = @a[ $cdiff[0] .. $cdiff[2]-1 ];
  1408     @list2 = @b[ $cdiff[1] .. $cdiff[3]-1 ];
  1409                # Hunk start   Hunk end
  1410 
  1411 Note that the hunks will always alternate between those that are part of
  1412 the LCS (those that contain unchanged items) and those that contain
  1413 changes.  This means that all we need to be told is whether the first
  1414 hunk is a 'same' or 'diff' hunk and we can determine which of the other
  1415 hunks contain 'same' items or 'diff' items.
  1416 
  1417 By convention, we always make the first hunk contain unchanged items.
  1418 So the 1st, 3rd, 5th, etc. hunks (all odd-numbered hunks if you start
  1419 counting from 1) all contain unchanged items.  And the 2nd, 4th, 6th,
  1420 etc. hunks (all even-numbered hunks if you start counting from 1) all
  1421 contain changed items.
  1422 
  1423 Since @a and @b don't begin with the same value, the first hunk in our
  1424 example is empty (otherwise we'd violate the above convention).  Note
  1425 that the first 4 index values in our example are all zero.  Plug these
  1426 values into our previous code block and we get:
  1427 
  1428     @hunk1a = @a[ 0 .. 0-1 ];
  1429     @hunk1b = @b[ 0 .. 0-1 ];
  1430 
  1431 And C<0..-1> returns the empty list.
  1432 
  1433 Move down one pair of indices (2..5) and we get the offset ranges for
  1434 the second hunk, which contains changed items.
  1435 
  1436 Since C<@diff[2..5]> contains (0,0,1,0) in our example, the second hunk
  1437 consists of these two lists of items:
  1438 
  1439         @hunk2a = @a[ $cdiff[2] .. $cdiff[4]-1 ];
  1440         @hunk2b = @b[ $cdiff[3] .. $cdiff[5]-1 ];
  1441     # or
  1442         @hunk2a = @a[ 0 .. 1-1 ];
  1443         @hunk2b = @b[ 0 .. 0-1 ];
  1444     # or
  1445         @hunk2a = @a[ 0 .. 0 ];
  1446         @hunk2b = @b[ 0 .. -1 ];
  1447     # or
  1448         @hunk2a = ( 'a' );
  1449         @hunk2b = ( );
  1450 
  1451 That is, we would delete item 0 ('a') from @a.
  1452 
  1453 Since C<@diff[4..7]> contains (1,0,3,2) in our example, the third hunk
  1454 consists of these two lists of items:
  1455 
  1456         @hunk3a = @a[ $cdiff[4] .. $cdiff[6]-1 ];
  1457         @hunk3a = @b[ $cdiff[5] .. $cdiff[7]-1 ];
  1458     # or
  1459         @hunk3a = @a[ 1 .. 3-1 ];
  1460         @hunk3a = @b[ 0 .. 2-1 ];
  1461     # or
  1462         @hunk3a = @a[ 1 .. 2 ];
  1463         @hunk3a = @b[ 0 .. 1 ];
  1464     # or
  1465         @hunk3a = qw( b c );
  1466         @hunk3a = qw( b c );
  1467 
  1468 Note that this third hunk contains unchanged items as our convention demands.
  1469 
  1470 You can continue this process until you reach the last two indices,
  1471 which will always be the number of items in each sequence.  This is
  1472 required so that subtracting one from each will give you the indices to
  1473 the last items in each sequence.
  1474 
  1475 =head2 C<traverse_sequences>
  1476 
  1477 C<traverse_sequences> used to be the most general facility provided by
  1478 this module (the new OO interface is more powerful and much easier to
  1479 use).
  1480 
  1481 Imagine that there are two arrows.  Arrow A points to an element of
  1482 sequence A, and arrow B points to an element of the sequence B. 
  1483 Initially, the arrows point to the first elements of the respective
  1484 sequences.  C<traverse_sequences> will advance the arrows through the
  1485 sequences one element at a time, calling an appropriate user-specified
  1486 callback function before each advance.  It willadvance the arrows in
  1487 such a way that if there are equal elements C<$A[$i]> and C<$B[$j]>
  1488 which are equal and which are part of the LCS, there will be some moment
  1489 during the execution of C<traverse_sequences> when arrow A is pointing
  1490 to C<$A[$i]> and arrow B is pointing to C<$B[$j]>.  When this happens,
  1491 C<traverse_sequences> will call the C<MATCH> callback function and then
  1492 it will advance both arrows.
  1493 
  1494 Otherwise, one of the arrows is pointing to an element of its sequence
  1495 that is not part of the LCS.  C<traverse_sequences> will advance that
  1496 arrow and will call the C<DISCARD_A> or the C<DISCARD_B> callback,
  1497 depending on which arrow it advanced.  If both arrows point to elements
  1498 that are not part of the LCS, then C<traverse_sequences> will advance
  1499 one of them and call the appropriate callback, but it is not specified
  1500 which it will call.
  1501 
  1502 The arguments to C<traverse_sequences> are the two sequences to
  1503 traverse, and a hash which specifies the callback functions, like this:
  1504 
  1505     traverse_sequences(
  1506         \@seq1, \@seq2,
  1507         {   MATCH => $callback_1,
  1508             DISCARD_A => $callback_2,
  1509             DISCARD_B => $callback_3,
  1510         }
  1511     );
  1512 
  1513 Callbacks for MATCH, DISCARD_A, and DISCARD_B are invoked with at least
  1514 the indices of the two arrows as their arguments.  They are not expected
  1515 to return any values.  If a callback is omitted from the table, it is
  1516 not called.
  1517 
  1518 Callbacks for A_FINISHED and B_FINISHED are invoked with at least the
  1519 corresponding index in A or B.
  1520 
  1521 If arrow A reaches the end of its sequence, before arrow B does,
  1522 C<traverse_sequences> will call the C<A_FINISHED> callback when it
  1523 advances arrow B, if there is such a function; if not it will call
  1524 C<DISCARD_B> instead.  Similarly if arrow B finishes first. 
  1525 C<traverse_sequences> returns when both arrows are at the ends of their
  1526 respective sequences.  It returns true on success and false on failure. 
  1527 At present there is no way to fail.
  1528 
  1529 C<traverse_sequences> may be passed an optional fourth parameter; this
  1530 is a CODE reference to a key generation function.  See L</KEY GENERATION
  1531 FUNCTIONS>.
  1532 
  1533 Additional parameters, if any, will be passed to the key generation function.
  1534 
  1535 If you want to pass additional parameters to your callbacks, but don't
  1536 need a custom key generation function, you can get the default by
  1537 passing undef:
  1538 
  1539     traverse_sequences(
  1540         \@seq1, \@seq2,
  1541         {   MATCH => $callback_1,
  1542             DISCARD_A => $callback_2,
  1543             DISCARD_B => $callback_3,
  1544         },
  1545         undef,     # default key-gen
  1546         $myArgument1,
  1547         $myArgument2,
  1548         $myArgument3,
  1549     );
  1550 
  1551 C<traverse_sequences> does not have a useful return value; you are
  1552 expected to plug in the appropriate behavior with the callback
  1553 functions.
  1554 
  1555 =head2 C<traverse_balanced>
  1556 
  1557 C<traverse_balanced> is an alternative to C<traverse_sequences>. It
  1558 uses a different algorithm to iterate through the entries in the
  1559 computed LCS. Instead of sticking to one side and showing element changes
  1560 as insertions and deletions only, it will jump back and forth between
  1561 the two sequences and report I<changes> occurring as deletions on one
  1562 side followed immediatly by an insertion on the other side.
  1563 
  1564 In addition to the C<DISCARD_A>, C<DISCARD_B>, and C<MATCH> callbacks
  1565 supported by C<traverse_sequences>, C<traverse_balanced> supports
  1566 a C<CHANGE> callback indicating that one element got C<replaced> by another:
  1567 
  1568     traverse_balanced(
  1569         \@seq1, \@seq2,
  1570         {   MATCH => $callback_1,
  1571             DISCARD_A => $callback_2,
  1572             DISCARD_B => $callback_3,
  1573             CHANGE    => $callback_4,
  1574         }
  1575     );
  1576 
  1577 If no C<CHANGE> callback is specified, C<traverse_balanced>
  1578 will map C<CHANGE> events to C<DISCARD_A> and C<DISCARD_B> actions,
  1579 therefore resulting in a similar behaviour as C<traverse_sequences>
  1580 with different order of events.
  1581 
  1582 C<traverse_balanced> might be a bit slower than C<traverse_sequences>,
  1583 noticable only while processing huge amounts of data.
  1584 
  1585 The C<sdiff> function of this module 
  1586 is implemented as call to C<traverse_balanced>.
  1587 
  1588 C<traverse_balanced> does not have a useful return value; you are expected to
  1589 plug in the appropriate behavior with the callback functions.
  1590 
  1591 =head1 KEY GENERATION FUNCTIONS
  1592 
  1593 Most of the functions accept an optional extra parameter.  This is a
  1594 CODE reference to a key generating (hashing) function that should return
  1595 a string that uniquely identifies a given element.  It should be the
  1596 case that if two elements are to be considered equal, their keys should
  1597 be the same (and the other way around).  If no key generation function
  1598 is provided, the key will be the element as a string.
  1599 
  1600 By default, comparisons will use "eq" and elements will be turned into keys
  1601 using the default stringizing operator '""'.
  1602 
  1603 Where this is important is when you're comparing something other than
  1604 strings.  If it is the case that you have multiple different objects
  1605 that should be considered to be equal, you should supply a key
  1606 generation function. Otherwise, you have to make sure that your arrays
  1607 contain unique references.
  1608 
  1609 For instance, consider this example:
  1610 
  1611     package Person;
  1612 
  1613     sub new
  1614     {
  1615         my $package = shift;
  1616         return bless { name => '', ssn => '', @_ }, $package;
  1617     }
  1618 
  1619     sub clone
  1620     {
  1621         my $old = shift;
  1622         my $new = bless { %$old }, ref($old);
  1623     }
  1624 
  1625     sub hash
  1626     {
  1627         return shift()->{'ssn'};
  1628     }
  1629 
  1630     my $person1 = Person->new( name => 'Joe', ssn => '123-45-6789' );
  1631     my $person2 = Person->new( name => 'Mary', ssn => '123-47-0000' );
  1632     my $person3 = Person->new( name => 'Pete', ssn => '999-45-2222' );
  1633     my $person4 = Person->new( name => 'Peggy', ssn => '123-45-9999' );
  1634     my $person5 = Person->new( name => 'Frank', ssn => '000-45-9999' );
  1635 
  1636 If you did this:
  1637 
  1638     my $array1 = [ $person1, $person2, $person4 ];
  1639     my $array2 = [ $person1, $person3, $person4, $person5 ];
  1640     Algorithm::Diff::diff( $array1, $array2 );
  1641 
  1642 everything would work out OK (each of the objects would be converted
  1643 into a string like "Person=HASH(0x82425b0)" for comparison).
  1644 
  1645 But if you did this:
  1646 
  1647     my $array1 = [ $person1, $person2, $person4 ];
  1648     my $array2 = [ $person1, $person3, $person4->clone(), $person5 ];
  1649     Algorithm::Diff::diff( $array1, $array2 );
  1650 
  1651 $person4 and $person4->clone() (which have the same name and SSN)
  1652 would be seen as different objects. If you wanted them to be considered
  1653 equivalent, you would have to pass in a key generation function:
  1654 
  1655     my $array1 = [ $person1, $person2, $person4 ];
  1656     my $array2 = [ $person1, $person3, $person4->clone(), $person5 ];
  1657     Algorithm::Diff::diff( $array1, $array2, \&Person::hash );
  1658 
  1659 This would use the 'ssn' field in each Person as a comparison key, and
  1660 so would consider $person4 and $person4->clone() as equal.
  1661 
  1662 You may also pass additional parameters to the key generation function
  1663 if you wish.
  1664 
  1665 =head1 ERROR CHECKING
  1666 
  1667 If you pass these routines a non-reference and they expect a reference,
  1668 they will die with a message.
  1669 
  1670 =head1 AUTHOR
  1671 
  1672 This version released by Tye McQueen (http://perlmonks.org/?node=tye).
  1673 
  1674 =head1 LICENSE
  1675 
  1676 Parts Copyright (c) 2000-2004 Ned Konz.  All rights reserved.
  1677 Parts by Tye McQueen.
  1678 
  1679 This program is free software; you can redistribute it and/or modify it
  1680 under the same terms as Perl.
  1681 
  1682 =head1 MAILING LIST
  1683 
  1684 Mark-Jason still maintains a mailing list.  To join a low-volume mailing
  1685 list for announcements related to diff and Algorithm::Diff, send an
  1686 empty mail message to mjd-perl-diff-request@plover.com.
  1687 
  1688 =head1 CREDITS
  1689 
  1690 Versions through 0.59 (and much of this documentation) were written by:
  1691 
  1692 Mark-Jason Dominus, mjd-perl-diff@plover.com
  1693 
  1694 This version borrows some documentation and routine names from
  1695 Mark-Jason's, but Diff.pm's code was completely replaced.
  1696 
  1697 This code was adapted from the Smalltalk code of Mario Wolczko
  1698 <mario@wolczko.com>, which is available at
  1699 ftp://st.cs.uiuc.edu/pub/Smalltalk/MANCHESTER/manchester/4.0/diff.st
  1700 
  1701 C<sdiff> and C<traverse_balanced> were written by Mike Schilli
  1702 <m@perlmeister.com>.
  1703 
  1704 The algorithm is that described in
  1705 I<A Fast Algorithm for Computing Longest Common Subsequences>,
  1706 CACM, vol.20, no.5, pp.350-353, May 1977, with a few
  1707 minor improvements to improve the speed.
  1708 
  1709 Much work was done by Ned Konz (perl@bike-nomad.com).
  1710 
  1711 The OO interface and some other changes are by Tye McQueen.
  1712 
  1713 =cut