lib/CPAN/lib/Algorithm/Diff.pm
changeset 0 414e01d06fd5
equal deleted inserted replaced
-1:000000000000 0:414e01d06fd5
       
     1 package Algorithm::Diff;
       
     2 # Skip to first "=head" line for documentation.
       
     3 use strict;
       
     4 
       
     5 use integer;    # see below in _replaceNextLargerWith() for mod to make
       
     6                 # if you don't use this
       
     7 use vars qw( $VERSION @EXPORT_OK );
       
     8 $VERSION = 1.19_01;
       
     9 #          ^ ^^ ^^-- Incremented at will
       
    10 #          | \+----- Incremented for non-trivial changes to features
       
    11 #          \-------- Incremented for fundamental changes
       
    12 require Exporter;
       
    13 *import    = \&Exporter::import;
       
    14 @EXPORT_OK = qw(
       
    15     prepare LCS LCDidx LCS_length
       
    16     diff sdiff compact_diff
       
    17     traverse_sequences traverse_balanced
       
    18 );
       
    19 
       
    20 # McIlroy-Hunt diff algorithm
       
    21 # Adapted from the Smalltalk code of Mario I. Wolczko, <mario@wolczko.com>
       
    22 # by Ned Konz, perl@bike-nomad.com
       
    23 # Updates by Tye McQueen, http://perlmonks.org/?node=tye
       
    24 
       
    25 # Create a hash that maps each element of $aCollection to the set of
       
    26 # positions it occupies in $aCollection, restricted to the elements
       
    27 # within the range of indexes specified by $start and $end.
       
    28 # The fourth parameter is a subroutine reference that will be called to
       
    29 # generate a string to use as a key.
       
    30 # Additional parameters, if any, will be passed to this subroutine.
       
    31 #
       
    32 # my $hashRef = _withPositionsOfInInterval( \@array, $start, $end, $keyGen );
       
    33 
       
    34 sub _withPositionsOfInInterval
       
    35 {
       
    36     my $aCollection = shift;    # array ref
       
    37     my $start       = shift;
       
    38     my $end         = shift;
       
    39     my $keyGen      = shift;
       
    40     my %d;
       
    41     my $index;
       
    42     for ( $index = $start ; $index <= $end ; $index++ )
       
    43     {
       
    44         my $element = $aCollection->[$index];
       
    45         my $key = &$keyGen( $element, @_ );
       
    46         if ( exists( $d{$key} ) )
       
    47         {
       
    48             unshift ( @{ $d{$key} }, $index );
       
    49         }
       
    50         else
       
    51         {
       
    52             $d{$key} = [$index];
       
    53         }
       
    54     }
       
    55     return wantarray ? %d : \%d;
       
    56 }
       
    57 
       
    58 # Find the place at which aValue would normally be inserted into the
       
    59 # array. If that place is already occupied by aValue, do nothing, and
       
    60 # return undef. If the place does not exist (i.e., it is off the end of
       
    61 # the array), add it to the end, otherwise replace the element at that
       
    62 # point with aValue.  It is assumed that the array's values are numeric.
       
    63 # This is where the bulk (75%) of the time is spent in this module, so
       
    64 # try to make it fast!
       
    65 
       
    66 sub _replaceNextLargerWith
       
    67 {
       
    68     my ( $array, $aValue, $high ) = @_;
       
    69     $high ||= $#$array;
       
    70 
       
    71     # off the end?
       
    72     if ( $high == -1 || $aValue > $array->[-1] )
       
    73     {
       
    74         push ( @$array, $aValue );
       
    75         return $high + 1;
       
    76     }
       
    77 
       
    78     # binary search for insertion point...
       
    79     my $low = 0;
       
    80     my $index;
       
    81     my $found;
       
    82     while ( $low <= $high )
       
    83     {
       
    84         $index = ( $high + $low ) / 2;
       
    85 
       
    86         # $index = int(( $high + $low ) / 2);  # without 'use integer'
       
    87         $found = $array->[$index];
       
    88 
       
    89         if ( $aValue == $found )
       
    90         {
       
    91             return undef;
       
    92         }
       
    93         elsif ( $aValue > $found )
       
    94         {
       
    95             $low = $index + 1;
       
    96         }
       
    97         else
       
    98         {
       
    99             $high = $index - 1;
       
   100         }
       
   101     }
       
   102 
       
   103     # now insertion point is in $low.
       
   104     $array->[$low] = $aValue;    # overwrite next larger
       
   105     return $low;
       
   106 }
       
   107 
       
   108 # This method computes the longest common subsequence in $a and $b.
       
   109 
       
   110 # Result is array or ref, whose contents is such that
       
   111 #   $a->[ $i ] == $b->[ $result[ $i ] ]
       
   112 # foreach $i in ( 0 .. $#result ) if $result[ $i ] is defined.
       
   113 
       
   114 # An additional argument may be passed; this is a hash or key generating
       
   115 # function that should return a string that uniquely identifies the given
       
   116 # element.  It should be the case that if the key is the same, the elements
       
   117 # will compare the same. If this parameter is undef or missing, the key
       
   118 # will be the element as a string.
       
   119 
       
   120 # By default, comparisons will use "eq" and elements will be turned into keys
       
   121 # using the default stringizing operator '""'.
       
   122 
       
   123 # Additional parameters, if any, will be passed to the key generation
       
   124 # routine.
       
   125 
       
   126 sub _longestCommonSubsequence
       
   127 {
       
   128     my $a        = shift;    # array ref or hash ref
       
   129     my $b        = shift;    # array ref or hash ref
       
   130     my $counting = shift;    # scalar
       
   131     my $keyGen   = shift;    # code ref
       
   132     my $compare;             # code ref
       
   133 
       
   134     if ( ref($a) eq 'HASH' )
       
   135     {                        # prepared hash must be in $b
       
   136         my $tmp = $b;
       
   137         $b = $a;
       
   138         $a = $tmp;
       
   139     }
       
   140 
       
   141     # Check for bogus (non-ref) argument values
       
   142     if ( !ref($a) || !ref($b) )
       
   143     {
       
   144         my @callerInfo = caller(1);
       
   145         die 'error: must pass array or hash references to ' . $callerInfo[3];
       
   146     }
       
   147 
       
   148     # set up code refs
       
   149     # Note that these are optimized.
       
   150     if ( !defined($keyGen) )    # optimize for strings
       
   151     {
       
   152         $keyGen = sub { $_[0] };
       
   153         $compare = sub { my ( $a, $b ) = @_; $a eq $b };
       
   154     }
       
   155     else
       
   156     {
       
   157         $compare = sub {
       
   158             my $a = shift;
       
   159             my $b = shift;
       
   160             &$keyGen( $a, @_ ) eq &$keyGen( $b, @_ );
       
   161         };
       
   162     }
       
   163 
       
   164     my ( $aStart, $aFinish, $matchVector ) = ( 0, $#$a, [] );
       
   165     my ( $prunedCount, $bMatches ) = ( 0, {} );
       
   166 
       
   167     if ( ref($b) eq 'HASH' )    # was $bMatches prepared for us?
       
   168     {
       
   169         $bMatches = $b;
       
   170     }
       
   171     else
       
   172     {
       
   173         my ( $bStart, $bFinish ) = ( 0, $#$b );
       
   174 
       
   175         # First we prune off any common elements at the beginning
       
   176         while ( $aStart <= $aFinish
       
   177             and $bStart <= $bFinish
       
   178             and &$compare( $a->[$aStart], $b->[$bStart], @_ ) )
       
   179         {
       
   180             $matchVector->[ $aStart++ ] = $bStart++;
       
   181             $prunedCount++;
       
   182         }
       
   183 
       
   184         # now the end
       
   185         while ( $aStart <= $aFinish
       
   186             and $bStart <= $bFinish
       
   187             and &$compare( $a->[$aFinish], $b->[$bFinish], @_ ) )
       
   188         {
       
   189             $matchVector->[ $aFinish-- ] = $bFinish--;
       
   190             $prunedCount++;
       
   191         }
       
   192 
       
   193         # Now compute the equivalence classes of positions of elements
       
   194         $bMatches =
       
   195           _withPositionsOfInInterval( $b, $bStart, $bFinish, $keyGen, @_ );
       
   196     }
       
   197     my $thresh = [];
       
   198     my $links  = [];
       
   199 
       
   200     my ( $i, $ai, $j, $k );
       
   201     for ( $i = $aStart ; $i <= $aFinish ; $i++ )
       
   202     {
       
   203         $ai = &$keyGen( $a->[$i], @_ );
       
   204         if ( exists( $bMatches->{$ai} ) )
       
   205         {
       
   206             $k = 0;
       
   207             for $j ( @{ $bMatches->{$ai} } )
       
   208             {
       
   209 
       
   210                 # optimization: most of the time this will be true
       
   211                 if ( $k and $thresh->[$k] > $j and $thresh->[ $k - 1 ] < $j )
       
   212                 {
       
   213                     $thresh->[$k] = $j;
       
   214                 }
       
   215                 else
       
   216                 {
       
   217                     $k = _replaceNextLargerWith( $thresh, $j, $k );
       
   218                 }
       
   219 
       
   220                 # oddly, it's faster to always test this (CPU cache?).
       
   221                 if ( defined($k) )
       
   222                 {
       
   223                     $links->[$k] =
       
   224                       [ ( $k ? $links->[ $k - 1 ] : undef ), $i, $j ];
       
   225                 }
       
   226             }
       
   227         }
       
   228     }
       
   229 
       
   230     if (@$thresh)
       
   231     {
       
   232         return $prunedCount + @$thresh if $counting;
       
   233         for ( my $link = $links->[$#$thresh] ; $link ; $link = $link->[0] )
       
   234         {
       
   235             $matchVector->[ $link->[1] ] = $link->[2];
       
   236         }
       
   237     }
       
   238     elsif ($counting)
       
   239     {
       
   240         return $prunedCount;
       
   241     }
       
   242 
       
   243     return wantarray ? @$matchVector : $matchVector;
       
   244 }
       
   245 
       
   246 sub traverse_sequences
       
   247 {
       
   248     my $a                 = shift;          # array ref
       
   249     my $b                 = shift;          # array ref
       
   250     my $callbacks         = shift || {};
       
   251     my $keyGen            = shift;
       
   252     my $matchCallback     = $callbacks->{'MATCH'} || sub { };
       
   253     my $discardACallback  = $callbacks->{'DISCARD_A'} || sub { };
       
   254     my $finishedACallback = $callbacks->{'A_FINISHED'};
       
   255     my $discardBCallback  = $callbacks->{'DISCARD_B'} || sub { };
       
   256     my $finishedBCallback = $callbacks->{'B_FINISHED'};
       
   257     my $matchVector = _longestCommonSubsequence( $a, $b, 0, $keyGen, @_ );
       
   258 
       
   259     # Process all the lines in @$matchVector
       
   260     my $lastA = $#$a;
       
   261     my $lastB = $#$b;
       
   262     my $bi    = 0;
       
   263     my $ai;
       
   264 
       
   265     for ( $ai = 0 ; $ai <= $#$matchVector ; $ai++ )
       
   266     {
       
   267         my $bLine = $matchVector->[$ai];
       
   268         if ( defined($bLine) )    # matched
       
   269         {
       
   270             &$discardBCallback( $ai, $bi++, @_ ) while $bi < $bLine;
       
   271             &$matchCallback( $ai,    $bi++, @_ );
       
   272         }
       
   273         else
       
   274         {
       
   275             &$discardACallback( $ai, $bi, @_ );
       
   276         }
       
   277     }
       
   278 
       
   279     # The last entry (if any) processed was a match.
       
   280     # $ai and $bi point just past the last matching lines in their sequences.
       
   281 
       
   282     while ( $ai <= $lastA or $bi <= $lastB )
       
   283     {
       
   284 
       
   285         # last A?
       
   286         if ( $ai == $lastA + 1 and $bi <= $lastB )
       
   287         {
       
   288             if ( defined($finishedACallback) )
       
   289             {
       
   290                 &$finishedACallback( $lastA, @_ );
       
   291                 $finishedACallback = undef;
       
   292             }
       
   293             else
       
   294             {
       
   295                 &$discardBCallback( $ai, $bi++, @_ ) while $bi <= $lastB;
       
   296             }
       
   297         }
       
   298 
       
   299         # last B?
       
   300         if ( $bi == $lastB + 1 and $ai <= $lastA )
       
   301         {
       
   302             if ( defined($finishedBCallback) )
       
   303             {
       
   304                 &$finishedBCallback( $lastB, @_ );
       
   305                 $finishedBCallback = undef;
       
   306             }
       
   307             else
       
   308             {
       
   309                 &$discardACallback( $ai++, $bi, @_ ) while $ai <= $lastA;
       
   310             }
       
   311         }
       
   312 
       
   313         &$discardACallback( $ai++, $bi, @_ ) if $ai <= $lastA;
       
   314         &$discardBCallback( $ai, $bi++, @_ ) if $bi <= $lastB;
       
   315     }
       
   316 
       
   317     return 1;
       
   318 }
       
   319 
       
   320 sub traverse_balanced
       
   321 {
       
   322     my $a                 = shift;              # array ref
       
   323     my $b                 = shift;              # array ref
       
   324     my $callbacks         = shift || {};
       
   325     my $keyGen            = shift;
       
   326     my $matchCallback     = $callbacks->{'MATCH'} || sub { };
       
   327     my $discardACallback  = $callbacks->{'DISCARD_A'} || sub { };
       
   328     my $discardBCallback  = $callbacks->{'DISCARD_B'} || sub { };
       
   329     my $changeCallback    = $callbacks->{'CHANGE'};
       
   330     my $matchVector = _longestCommonSubsequence( $a, $b, 0, $keyGen, @_ );
       
   331 
       
   332     # Process all the lines in match vector
       
   333     my $lastA = $#$a;
       
   334     my $lastB = $#$b;
       
   335     my $bi    = 0;
       
   336     my $ai    = 0;
       
   337     my $ma    = -1;
       
   338     my $mb;
       
   339 
       
   340     while (1)
       
   341     {
       
   342 
       
   343         # Find next match indices $ma and $mb
       
   344         do {
       
   345             $ma++;
       
   346         } while(
       
   347                 $ma <= $#$matchVector
       
   348             &&  !defined $matchVector->[$ma]
       
   349         );
       
   350 
       
   351         last if $ma > $#$matchVector;    # end of matchVector?
       
   352         $mb = $matchVector->[$ma];
       
   353 
       
   354         # Proceed with discard a/b or change events until
       
   355         # next match
       
   356         while ( $ai < $ma || $bi < $mb )
       
   357         {
       
   358 
       
   359             if ( $ai < $ma && $bi < $mb )
       
   360             {
       
   361 
       
   362                 # Change
       
   363                 if ( defined $changeCallback )
       
   364                 {
       
   365                     &$changeCallback( $ai++, $bi++, @_ );
       
   366                 }
       
   367                 else
       
   368                 {
       
   369                     &$discardACallback( $ai++, $bi, @_ );
       
   370                     &$discardBCallback( $ai, $bi++, @_ );
       
   371                 }
       
   372             }
       
   373             elsif ( $ai < $ma )
       
   374             {
       
   375                 &$discardACallback( $ai++, $bi, @_ );
       
   376             }
       
   377             else
       
   378             {
       
   379 
       
   380                 # $bi < $mb
       
   381                 &$discardBCallback( $ai, $bi++, @_ );
       
   382             }
       
   383         }
       
   384 
       
   385         # Match
       
   386         &$matchCallback( $ai++, $bi++, @_ );
       
   387     }
       
   388 
       
   389     while ( $ai <= $lastA || $bi <= $lastB )
       
   390     {
       
   391         if ( $ai <= $lastA && $bi <= $lastB )
       
   392         {
       
   393 
       
   394             # Change
       
   395             if ( defined $changeCallback )
       
   396             {
       
   397                 &$changeCallback( $ai++, $bi++, @_ );
       
   398             }
       
   399             else
       
   400             {
       
   401                 &$discardACallback( $ai++, $bi, @_ );
       
   402                 &$discardBCallback( $ai, $bi++, @_ );
       
   403             }
       
   404         }
       
   405         elsif ( $ai <= $lastA )
       
   406         {
       
   407             &$discardACallback( $ai++, $bi, @_ );
       
   408         }
       
   409         else
       
   410         {
       
   411 
       
   412             # $bi <= $lastB
       
   413             &$discardBCallback( $ai, $bi++, @_ );
       
   414         }
       
   415     }
       
   416 
       
   417     return 1;
       
   418 }
       
   419 
       
   420 sub prepare
       
   421 {
       
   422     my $a       = shift;    # array ref
       
   423     my $keyGen  = shift;    # code ref
       
   424 
       
   425     # set up code ref
       
   426     $keyGen = sub { $_[0] } unless defined($keyGen);
       
   427 
       
   428     return scalar _withPositionsOfInInterval( $a, 0, $#$a, $keyGen, @_ );
       
   429 }
       
   430 
       
   431 sub LCS
       
   432 {
       
   433     my $a = shift;                  # array ref
       
   434     my $b = shift;                  # array ref or hash ref
       
   435     my $matchVector = _longestCommonSubsequence( $a, $b, 0, @_ );
       
   436     my @retval;
       
   437     my $i;
       
   438     for ( $i = 0 ; $i <= $#$matchVector ; $i++ )
       
   439     {
       
   440         if ( defined( $matchVector->[$i] ) )
       
   441         {
       
   442             push ( @retval, $a->[$i] );
       
   443         }
       
   444     }
       
   445     return wantarray ? @retval : \@retval;
       
   446 }
       
   447 
       
   448 sub LCS_length
       
   449 {
       
   450     my $a = shift;                          # array ref
       
   451     my $b = shift;                          # array ref or hash ref
       
   452     return _longestCommonSubsequence( $a, $b, 1, @_ );
       
   453 }
       
   454 
       
   455 sub LCSidx
       
   456 {
       
   457     my $a= shift @_;
       
   458     my $b= shift @_;
       
   459     my $match= _longestCommonSubsequence( $a, $b, 0, @_ );
       
   460     my @am= grep defined $match->[$_], 0..$#$match;
       
   461     my @bm= @{$match}[@am];
       
   462     return \@am, \@bm;
       
   463 }
       
   464 
       
   465 sub compact_diff
       
   466 {
       
   467     my $a= shift @_;
       
   468     my $b= shift @_;
       
   469     my( $am, $bm )= LCSidx( $a, $b, @_ );
       
   470     my @cdiff;
       
   471     my( $ai, $bi )= ( 0, 0 );
       
   472     push @cdiff, $ai, $bi;
       
   473     while( 1 ) {
       
   474         while(  @$am  &&  $ai == $am->[0]  &&  $bi == $bm->[0]  ) {
       
   475             shift @$am;
       
   476             shift @$bm;
       
   477             ++$ai, ++$bi;
       
   478         }
       
   479         push @cdiff, $ai, $bi;
       
   480         last   if  ! @$am;
       
   481         $ai = $am->[0];
       
   482         $bi = $bm->[0];
       
   483         push @cdiff, $ai, $bi;
       
   484     }
       
   485     push @cdiff, 0+@$a, 0+@$b
       
   486         if  $ai < @$a || $bi < @$b;
       
   487     return wantarray ? @cdiff : \@cdiff;
       
   488 }
       
   489 
       
   490 sub diff
       
   491 {
       
   492     my $a      = shift;    # array ref
       
   493     my $b      = shift;    # array ref
       
   494     my $retval = [];
       
   495     my $hunk   = [];
       
   496     my $discard = sub {
       
   497         push @$hunk, [ '-', $_[0], $a->[ $_[0] ] ];
       
   498     };
       
   499     my $add = sub {
       
   500         push @$hunk, [ '+', $_[1], $b->[ $_[1] ] ];
       
   501     };
       
   502     my $match = sub {
       
   503         push @$retval, $hunk
       
   504             if 0 < @$hunk;
       
   505         $hunk = []
       
   506     };
       
   507     traverse_sequences( $a, $b,
       
   508         { MATCH => $match, DISCARD_A => $discard, DISCARD_B => $add }, @_ );
       
   509     &$match();
       
   510     return wantarray ? @$retval : $retval;
       
   511 }
       
   512 
       
   513 sub sdiff
       
   514 {
       
   515     my $a      = shift;    # array ref
       
   516     my $b      = shift;    # array ref
       
   517     my $retval = [];
       
   518     my $discard = sub { push ( @$retval, [ '-', $a->[ $_[0] ], "" ] ) };
       
   519     my $add = sub { push ( @$retval, [ '+', "", $b->[ $_[1] ] ] ) };
       
   520     my $change = sub {
       
   521         push ( @$retval, [ 'c', $a->[ $_[0] ], $b->[ $_[1] ] ] );
       
   522     };
       
   523     my $match = sub {
       
   524         push ( @$retval, [ 'u', $a->[ $_[0] ], $b->[ $_[1] ] ] );
       
   525     };
       
   526     traverse_balanced(
       
   527         $a,
       
   528         $b,
       
   529         {
       
   530             MATCH     => $match,
       
   531             DISCARD_A => $discard,
       
   532             DISCARD_B => $add,
       
   533             CHANGE    => $change,
       
   534         },
       
   535         @_
       
   536     );
       
   537     return wantarray ? @$retval : $retval;
       
   538 }
       
   539 
       
   540 ########################################
       
   541 my $Root= __PACKAGE__;
       
   542 package Algorithm::Diff::_impl;
       
   543 use strict;
       
   544 
       
   545 sub _Idx()  { 0 } # $me->[_Idx]: Ref to array of hunk indices
       
   546             # 1   # $me->[1]: Ref to first sequence
       
   547             # 2   # $me->[2]: Ref to second sequence
       
   548 sub _End()  { 3 } # $me->[_End]: Diff between forward and reverse pos
       
   549 sub _Same() { 4 } # $me->[_Same]: 1 if pos 1 contains unchanged items
       
   550 sub _Base() { 5 } # $me->[_Base]: Added to range's min and max
       
   551 sub _Pos()  { 6 } # $me->[_Pos]: Which hunk is currently selected
       
   552 sub _Off()  { 7 } # $me->[_Off]: Offset into _Idx for current position
       
   553 sub _Min() { -2 } # Added to _Off to get min instead of max+1
       
   554 
       
   555 sub Die
       
   556 {
       
   557     require Carp;
       
   558     Carp::confess( @_ );
       
   559 }
       
   560 
       
   561 sub _ChkPos
       
   562 {
       
   563     my( $me )= @_;
       
   564     return   if  $me->[_Pos];
       
   565     my $meth= ( caller(1) )[3];
       
   566     Die( "Called $meth on 'reset' object" );
       
   567 }
       
   568 
       
   569 sub _ChkSeq
       
   570 {
       
   571     my( $me, $seq )= @_;
       
   572     return $seq + $me->[_Off]
       
   573         if  1 == $seq  ||  2 == $seq;
       
   574     my $meth= ( caller(1) )[3];
       
   575     Die( "$meth: Invalid sequence number ($seq); must be 1 or 2" );
       
   576 }
       
   577 
       
   578 sub getObjPkg
       
   579 {
       
   580     my( $us )= @_;
       
   581     return ref $us   if  ref $us;
       
   582     return $us . "::_obj";
       
   583 }
       
   584 
       
   585 sub new
       
   586 {
       
   587     my( $us, $seq1, $seq2, $opts ) = @_;
       
   588     my @args;
       
   589     for( $opts->{keyGen} ) {
       
   590         push @args, $_   if  $_;
       
   591     }
       
   592     for( $opts->{keyGenArgs} ) {
       
   593         push @args, @$_   if  $_;
       
   594     }
       
   595     my $cdif= Algorithm::Diff::compact_diff( $seq1, $seq2, @args );
       
   596     my $same= 1;
       
   597     if(  0 == $cdif->[2]  &&  0 == $cdif->[3]  ) {
       
   598         $same= 0;
       
   599         splice @$cdif, 0, 2;
       
   600     }
       
   601     my @obj= ( $cdif, $seq1, $seq2 );
       
   602     $obj[_End] = (1+@$cdif)/2;
       
   603     $obj[_Same] = $same;
       
   604     $obj[_Base] = 0;
       
   605     my $me = bless \@obj, $us->getObjPkg();
       
   606     $me->Reset( 0 );
       
   607     return $me;
       
   608 }
       
   609 
       
   610 sub Reset
       
   611 {
       
   612     my( $me, $pos )= @_;
       
   613     $pos= int( $pos || 0 );
       
   614     $pos += $me->[_End]
       
   615         if  $pos < 0;
       
   616     $pos= 0
       
   617         if  $pos < 0  ||  $me->[_End] <= $pos;
       
   618     $me->[_Pos]= $pos || !1;
       
   619     $me->[_Off]= 2*$pos - 1;
       
   620     return $me;
       
   621 }
       
   622 
       
   623 sub Base
       
   624 {
       
   625     my( $me, $base )= @_;
       
   626     my $oldBase= $me->[_Base];
       
   627     $me->[_Base]= 0+$base   if  defined $base;
       
   628     return $oldBase;
       
   629 }
       
   630 
       
   631 sub Copy
       
   632 {
       
   633     my( $me, $pos, $base )= @_;
       
   634     my @obj= @$me;
       
   635     my $you= bless \@obj, ref($me);
       
   636     $you->Reset( $pos )   if  defined $pos;
       
   637     $you->Base( $base );
       
   638     return $you;
       
   639 }
       
   640 
       
   641 sub Next {
       
   642     my( $me, $steps )= @_;
       
   643     $steps= 1   if  ! defined $steps;
       
   644     if( $steps ) {
       
   645         my $pos= $me->[_Pos];
       
   646         my $new= $pos + $steps;
       
   647         $new= 0   if  $pos  &&  $new < 0;
       
   648         $me->Reset( $new )
       
   649     }
       
   650     return $me->[_Pos];
       
   651 }
       
   652 
       
   653 sub Prev {
       
   654     my( $me, $steps )= @_;
       
   655     $steps= 1   if  ! defined $steps;
       
   656     my $pos= $me->Next(-$steps);
       
   657     $pos -= $me->[_End]   if  $pos;
       
   658     return $pos;
       
   659 }
       
   660 
       
   661 sub Diff {
       
   662     my( $me )= @_;
       
   663     $me->_ChkPos();
       
   664     return 0   if  $me->[_Same] == ( 1 & $me->[_Pos] );
       
   665     my $ret= 0;
       
   666     my $off= $me->[_Off];
       
   667     for my $seq ( 1, 2 ) {
       
   668         $ret |= $seq
       
   669             if  $me->[_Idx][ $off + $seq + _Min ]
       
   670             <   $me->[_Idx][ $off + $seq ];
       
   671     }
       
   672     return $ret;
       
   673 }
       
   674 
       
   675 sub Min {
       
   676     my( $me, $seq, $base )= @_;
       
   677     $me->_ChkPos();
       
   678     my $off= $me->_ChkSeq($seq);
       
   679     $base= $me->[_Base] if !defined $base;
       
   680     return $base + $me->[_Idx][ $off + _Min ];
       
   681 }
       
   682 
       
   683 sub Max {
       
   684     my( $me, $seq, $base )= @_;
       
   685     $me->_ChkPos();
       
   686     my $off= $me->_ChkSeq($seq);
       
   687     $base= $me->[_Base] if !defined $base;
       
   688     return $base + $me->[_Idx][ $off ] -1;
       
   689 }
       
   690 
       
   691 sub Range {
       
   692     my( $me, $seq, $base )= @_;
       
   693     $me->_ChkPos();
       
   694     my $off = $me->_ChkSeq($seq);
       
   695     if( !wantarray ) {
       
   696         return  $me->[_Idx][ $off ]
       
   697             -   $me->[_Idx][ $off + _Min ];
       
   698     }
       
   699     $base= $me->[_Base] if !defined $base;
       
   700     return  ( $base + $me->[_Idx][ $off + _Min ] )
       
   701         ..  ( $base + $me->[_Idx][ $off ] - 1 );
       
   702 }
       
   703 
       
   704 sub Items {
       
   705     my( $me, $seq )= @_;
       
   706     $me->_ChkPos();
       
   707     my $off = $me->_ChkSeq($seq);
       
   708     if( !wantarray ) {
       
   709         return  $me->[_Idx][ $off ]
       
   710             -   $me->[_Idx][ $off + _Min ];
       
   711     }
       
   712     return
       
   713         @{$me->[$seq]}[
       
   714                 $me->[_Idx][ $off + _Min ]
       
   715             ..  ( $me->[_Idx][ $off ] - 1 )
       
   716         ];
       
   717 }
       
   718 
       
   719 sub Same {
       
   720     my( $me )= @_;
       
   721     $me->_ChkPos();
       
   722     return wantarray ? () : 0
       
   723         if  $me->[_Same] != ( 1 & $me->[_Pos] );
       
   724     return $me->Items(1);
       
   725 }
       
   726 
       
   727 my %getName;
       
   728 BEGIN {
       
   729     %getName= (
       
   730         same => \&Same,
       
   731         diff => \&Diff,
       
   732         base => \&Base,
       
   733         min  => \&Min,
       
   734         max  => \&Max,
       
   735         range=> \&Range,
       
   736         items=> \&Items, # same thing
       
   737     );
       
   738 }
       
   739 
       
   740 sub Get
       
   741 {
       
   742     my $me= shift @_;
       
   743     $me->_ChkPos();
       
   744     my @value;
       
   745     for my $arg (  @_  ) {
       
   746         for my $word (  split ' ', $arg  ) {
       
   747             my $meth;
       
   748             if(     $word !~ /^(-?\d+)?([a-zA-Z]+)([12])?$/
       
   749                 ||  not  $meth= $getName{ lc $2 }
       
   750             ) {
       
   751                 Die( $Root, ", Get: Invalid request ($word)" );
       
   752             }
       
   753             my( $base, $name, $seq )= ( $1, $2, $3 );
       
   754             push @value, scalar(
       
   755                 4 == length($name)
       
   756                     ? $meth->( $me )
       
   757                     : $meth->( $me, $seq, $base )
       
   758             );
       
   759         }
       
   760     }
       
   761     if(  wantarray  ) {
       
   762         return @value;
       
   763     } elsif(  1 == @value  ) {
       
   764         return $value[0];
       
   765     }
       
   766     Die( 0+@value, " values requested from ",
       
   767         $Root, "'s Get in scalar context" );
       
   768 }
       
   769 
       
   770 
       
   771 my $Obj= getObjPkg($Root);
       
   772 no strict 'refs';
       
   773 
       
   774 for my $meth (  qw( new getObjPkg )  ) {
       
   775     *{$Root."::".$meth} = \&{$meth};
       
   776     *{$Obj ."::".$meth} = \&{$meth};
       
   777 }
       
   778 for my $meth (  qw(
       
   779     Next Prev Reset Copy Base Diff
       
   780     Same Items Range Min Max Get
       
   781     _ChkPos _ChkSeq
       
   782 )  ) {
       
   783     *{$Obj."::".$meth} = \&{$meth};
       
   784 }
       
   785 
       
   786 1;
       
   787 __END__
       
   788 
       
   789 =head1 NAME
       
   790 
       
   791 Algorithm::Diff - Compute `intelligent' differences between two files / lists
       
   792 
       
   793 =head1 SYNOPSIS
       
   794 
       
   795     require Algorithm::Diff;
       
   796 
       
   797     # This example produces traditional 'diff' output:
       
   798 
       
   799     my $diff = Algorithm::Diff->new( \@seq1, \@seq2 );
       
   800 
       
   801     $diff->Base( 1 );   # Return line numbers, not indices
       
   802     while(  $diff->Next()  ) {
       
   803         next   if  $diff->Same();
       
   804         my $sep = '';
       
   805         if(  ! $diff->Items(2)  ) {
       
   806             sprintf "%d,%dd%d\n",
       
   807                 $diff->Get(qw( Min1 Max1 Max2 ));
       
   808         } elsif(  ! $diff->Items(1)  ) {
       
   809             sprint "%da%d,%d\n",
       
   810                 $diff->Get(qw( Max1 Min2 Max2 ));
       
   811         } else {
       
   812             $sep = "---\n";
       
   813             sprintf "%d,%dc%d,%d\n",
       
   814                 $diff->Get(qw( Min1 Max1 Min2 Max2 ));
       
   815         }
       
   816         print "< $_"   for  $diff->Items(1);
       
   817         print $sep;
       
   818         print "> $_"   for  $diff->Items(2);
       
   819     }
       
   820 
       
   821 
       
   822     # Alternate interfaces:
       
   823 
       
   824     use Algorithm::Diff qw(
       
   825         LCS LCS_length LCSidx
       
   826         diff sdiff compact_diff
       
   827         traverse_sequences traverse_balanced );
       
   828 
       
   829     @lcs    = LCS( \@seq1, \@seq2 );
       
   830     $lcsref = LCS( \@seq1, \@seq2 );
       
   831     $count  = LCS_length( \@seq1, \@seq2 );
       
   832 
       
   833     ( $seq1idxref, $seq2idxref ) = LCSidx( \@seq1, \@seq2 );
       
   834 
       
   835 
       
   836     # Complicated interfaces:
       
   837 
       
   838     @diffs  = diff( \@seq1, \@seq2 );
       
   839 
       
   840     @sdiffs = sdiff( \@seq1, \@seq2 );
       
   841 
       
   842     @cdiffs = compact_diff( \@seq1, \@seq2 );
       
   843 
       
   844     traverse_sequences(
       
   845         \@seq1,
       
   846         \@seq2,
       
   847         {   MATCH     => \&callback1,
       
   848             DISCARD_A => \&callback2,
       
   849             DISCARD_B => \&callback3,
       
   850         },
       
   851         \&key_generator,
       
   852         @extra_args,
       
   853     );
       
   854 
       
   855     traverse_balanced(
       
   856         \@seq1,
       
   857         \@seq2,
       
   858         {   MATCH     => \&callback1,
       
   859             DISCARD_A => \&callback2,
       
   860             DISCARD_B => \&callback3,
       
   861             CHANGE    => \&callback4,
       
   862         },
       
   863         \&key_generator,
       
   864         @extra_args,
       
   865     );
       
   866 
       
   867 
       
   868 =head1 INTRODUCTION
       
   869 
       
   870 (by Mark-Jason Dominus)
       
   871 
       
   872 I once read an article written by the authors of C<diff>; they said
       
   873 that they worked very hard on the algorithm until they found the
       
   874 right one.
       
   875 
       
   876 I think what they ended up using (and I hope someone will correct me,
       
   877 because I am not very confident about this) was the `longest common
       
   878 subsequence' method.  In the LCS problem, you have two sequences of
       
   879 items:
       
   880 
       
   881     a b c d f g h j q z
       
   882 
       
   883     a b c d e f g i j k r x y z
       
   884 
       
   885 and you want to find the longest sequence of items that is present in
       
   886 both original sequences in the same order.  That is, you want to find
       
   887 a new sequence I<S> which can be obtained from the first sequence by
       
   888 deleting some items, and from the secend sequence by deleting other
       
   889 items.  You also want I<S> to be as long as possible.  In this case I<S>
       
   890 is
       
   891 
       
   892     a b c d f g j z
       
   893 
       
   894 From there it's only a small step to get diff-like output:
       
   895 
       
   896     e   h i   k   q r x y
       
   897     +   - +   +   - + + +
       
   898 
       
   899 This module solves the LCS problem.  It also includes a canned function
       
   900 to generate C<diff>-like output.
       
   901 
       
   902 It might seem from the example above that the LCS of two sequences is
       
   903 always pretty obvious, but that's not always the case, especially when
       
   904 the two sequences have many repeated elements.  For example, consider
       
   905 
       
   906     a x b y c z p d q
       
   907     a b c a x b y c z
       
   908 
       
   909 A naive approach might start by matching up the C<a> and C<b> that
       
   910 appear at the beginning of each sequence, like this:
       
   911 
       
   912     a x b y c         z p d q
       
   913     a   b   c a b y c z
       
   914 
       
   915 This finds the common subsequence C<a b c z>.  But actually, the LCS
       
   916 is C<a x b y c z>:
       
   917 
       
   918           a x b y c z p d q
       
   919     a b c a x b y c z
       
   920 
       
   921 or
       
   922 
       
   923     a       x b y c z p d q
       
   924     a b c a x b y c z
       
   925 
       
   926 =head1 USAGE
       
   927 
       
   928 (See also the README file and several example
       
   929 scripts include with this module.)
       
   930 
       
   931 This module now provides an object-oriented interface that uses less
       
   932 memory and is easier to use than most of the previous procedural
       
   933 interfaces.  It also still provides several exportable functions.  We'll
       
   934 deal with these in ascending order of difficulty:  C<LCS>,
       
   935 C<LCS_length>, C<LCSidx>, OO interface, C<prepare>, C<diff>, C<sdiff>,
       
   936 C<traverse_sequences>, and C<traverse_balanced>.
       
   937 
       
   938 =head2 C<LCS>
       
   939 
       
   940 Given references to two lists of items, LCS returns an array containing
       
   941 their longest common subsequence.  In scalar context, it returns a
       
   942 reference to such a list.
       
   943 
       
   944     @lcs    = LCS( \@seq1, \@seq2 );
       
   945     $lcsref = LCS( \@seq1, \@seq2 );
       
   946 
       
   947 C<LCS> may be passed an optional third parameter; this is a CODE
       
   948 reference to a key generation function.  See L</KEY GENERATION
       
   949 FUNCTIONS>.
       
   950 
       
   951     @lcs    = LCS( \@seq1, \@seq2, \&keyGen, @args );
       
   952     $lcsref = LCS( \@seq1, \@seq2, \&keyGen, @args );
       
   953 
       
   954 Additional parameters, if any, will be passed to the key generation
       
   955 routine.
       
   956 
       
   957 =head2 C<LCS_length>
       
   958 
       
   959 This is just like C<LCS> except it only returns the length of the
       
   960 longest common subsequence.  This provides a performance gain of about
       
   961 9% compared to C<LCS>.
       
   962 
       
   963 =head2 C<LCSidx>
       
   964 
       
   965 Like C<LCS> except it returns references to two arrays.  The first array
       
   966 contains the indices into @seq1 where the LCS items are located.  The
       
   967 second array contains the indices into @seq2 where the LCS items are located.
       
   968 
       
   969 Therefore, the following three lists will contain the same values:
       
   970 
       
   971     my( $idx1, $idx2 ) = LCSidx( \@seq1, \@seq2 );
       
   972     my @list1 = @seq1[ @$idx1 ];
       
   973     my @list2 = @seq2[ @$idx2 ];
       
   974     my @list3 = LCS( \@seq1, \@seq2 );
       
   975 
       
   976 =head2 C<new>
       
   977 
       
   978     $diff = Algorithm::Diffs->new( \@seq1, \@seq2 );
       
   979     $diff = Algorithm::Diffs->new( \@seq1, \@seq2, \%opts );
       
   980 
       
   981 C<new> computes the smallest set of additions and deletions necessary
       
   982 to turn the first sequence into the second and compactly records them
       
   983 in the object.
       
   984 
       
   985 You use the object to iterate over I<hunks>, where each hunk represents
       
   986 a contiguous section of items which should be added, deleted, replaced,
       
   987 or left unchanged.
       
   988 
       
   989 =over 4
       
   990 
       
   991 The following summary of all of the methods looks a lot like Perl code
       
   992 but some of the symbols have different meanings:
       
   993 
       
   994     [ ]     Encloses optional arguments
       
   995     :       Is followed by the default value for an optional argument
       
   996     |       Separates alternate return results
       
   997 
       
   998 Method summary:
       
   999 
       
  1000     $obj        = Algorithm::Diff->new( \@seq1, \@seq2, [ \%opts ] );
       
  1001     $pos        = $obj->Next(  [ $count : 1 ] );
       
  1002     $revPos     = $obj->Prev(  [ $count : 1 ] );
       
  1003     $obj        = $obj->Reset( [ $pos : 0 ] );
       
  1004     $copy       = $obj->Copy(  [ $pos, [ $newBase ] ] );
       
  1005     $oldBase    = $obj->Base(  [ $newBase ] );
       
  1006 
       
  1007 Note that all of the following methods C<die> if used on an object that
       
  1008 is "reset" (not currently pointing at any hunk).
       
  1009 
       
  1010     $bits       = $obj->Diff(  );
       
  1011     @items|$cnt = $obj->Same(  );
       
  1012     @items|$cnt = $obj->Items( $seqNum );
       
  1013     @idxs |$cnt = $obj->Range( $seqNum, [ $base ] );
       
  1014     $minIdx     = $obj->Min(   $seqNum, [ $base ] );
       
  1015     $maxIdx     = $obj->Max(   $seqNum, [ $base ] );
       
  1016     @values     = $obj->Get(   @names );
       
  1017 
       
  1018 Passing in C<undef> for an optional argument is always treated the same
       
  1019 as if no argument were passed in.
       
  1020 
       
  1021 =item C<Next>
       
  1022 
       
  1023     $pos = $diff->Next();    # Move forward 1 hunk
       
  1024     $pos = $diff->Next( 2 ); # Move forward 2 hunks
       
  1025     $pos = $diff->Next(-5);  # Move backward 5 hunks
       
  1026 
       
  1027 C<Next> moves the object to point at the next hunk.  The object starts
       
  1028 out "reset", which means it isn't pointing at any hunk.  If the object
       
  1029 is reset, then C<Next()> moves to the first hunk.
       
  1030 
       
  1031 C<Next> returns a true value iff the move didn't go past the last hunk.
       
  1032 So C<Next(0)> will return true iff the object is not reset.
       
  1033 
       
  1034 Actually, C<Next> returns the object's new position, which is a number
       
  1035 between 1 and the number of hunks (inclusive), or returns a false value.
       
  1036 
       
  1037 =item C<Prev>
       
  1038 
       
  1039 C<Prev($N)> is almost identical to C<Next(-$N)>; it moves to the $Nth
       
  1040 previous hunk.  On a 'reset' object, C<Prev()> [and C<Next(-1)>] move
       
  1041 to the last hunk.
       
  1042 
       
  1043 The position returned by C<Prev> is relative to the I<end> of the
       
  1044 hunks; -1 for the last hunk, -2 for the second-to-last, etc.
       
  1045 
       
  1046 =item C<Reset>
       
  1047 
       
  1048     $diff->Reset();     # Reset the object's position
       
  1049     $diff->Reset($pos); # Move to the specified hunk
       
  1050     $diff->Reset(1);    # Move to the first hunk
       
  1051     $diff->Reset(-1);   # Move to the last hunk
       
  1052 
       
  1053 C<Reset> returns the object, so, for example, you could use
       
  1054 C<< $diff->Reset()->Next(-1) >> to get the number of hunks.
       
  1055 
       
  1056 =item C<Copy>
       
  1057 
       
  1058     $copy = $diff->Copy( $newPos, $newBase );
       
  1059 
       
  1060 C<Copy> returns a copy of the object.  The copy and the orignal object
       
  1061 share most of their data, so making copies takes very little memory.
       
  1062 The copy maintains its own position (separate from the original), which
       
  1063 is the main purpose of copies.  It also maintains its own base.
       
  1064 
       
  1065 By default, the copy's position starts out the same as the original
       
  1066 object's position.  But C<Copy> takes an optional first argument to set the
       
  1067 new position, so the following three snippets are equivalent:
       
  1068 
       
  1069     $copy = $diff->Copy($pos);
       
  1070 
       
  1071     $copy = $diff->Copy();
       
  1072     $copy->Reset($pos);
       
  1073 
       
  1074     $copy = $diff->Copy()->Reset($pos);
       
  1075 
       
  1076 C<Copy> takes an optional second argument to set the base for
       
  1077 the copy.  If you wish to change the base of the copy but leave
       
  1078 the position the same as in the original, here are two
       
  1079 equivalent ways:
       
  1080 
       
  1081     $copy = $diff->Copy();
       
  1082     $copy->Base( 0 );
       
  1083 
       
  1084     $copy = $diff->Copy(undef,0);
       
  1085 
       
  1086 Here are two equivalent way to get a "reset" copy:
       
  1087 
       
  1088     $copy = $diff->Copy(0);
       
  1089 
       
  1090     $copy = $diff->Copy()->Reset();
       
  1091 
       
  1092 =item C<Diff>
       
  1093 
       
  1094     $bits = $obj->Diff();
       
  1095 
       
  1096 C<Diff> returns a true value iff the current hunk contains items that are
       
  1097 different between the two sequences.  It actually returns one of the
       
  1098 follow 4 values:
       
  1099 
       
  1100 =over 4
       
  1101 
       
  1102 =item 3
       
  1103 
       
  1104 C<3==(1|2)>.  This hunk contains items from @seq1 and the items
       
  1105 from @seq2 that should replace them.  Both sequence 1 and 2
       
  1106 contain changed items so both the 1 and 2 bits are set.
       
  1107 
       
  1108 =item 2
       
  1109 
       
  1110 This hunk only contains items from @seq2 that should be inserted (not
       
  1111 items from @seq1).  Only sequence 2 contains changed items so only the 2
       
  1112 bit is set.
       
  1113 
       
  1114 =item 1
       
  1115 
       
  1116 This hunk only contains items from @seq1 that should be deleted (not
       
  1117 items from @seq2).  Only sequence 1 contains changed items so only the 1
       
  1118 bit is set.
       
  1119 
       
  1120 =item 0
       
  1121 
       
  1122 This means that the items in this hunk are the same in both sequences.
       
  1123 Neither sequence 1 nor 2 contain changed items so neither the 1 nor the
       
  1124 2 bits are set.
       
  1125 
       
  1126 =back
       
  1127 
       
  1128 =item C<Same>
       
  1129 
       
  1130 C<Same> returns a true value iff the current hunk contains items that
       
  1131 are the same in both sequences.  It actually returns the list of items
       
  1132 if they are the same or an emty list if they aren't.  In a scalar
       
  1133 context, it returns the size of the list.
       
  1134 
       
  1135 =item C<Items>
       
  1136 
       
  1137     $count = $diff->Items(2);
       
  1138     @items = $diff->Items($seqNum);
       
  1139 
       
  1140 C<Items> returns the (number of) items from the specified sequence that
       
  1141 are part of the current hunk.
       
  1142 
       
  1143 If the current hunk contains only insertions, then
       
  1144 C<< $diff->Items(1) >> will return an empty list (0 in a scalar conext).
       
  1145 If the current hunk contains only deletions, then C<< $diff->Items(2) >>
       
  1146 will return an empty list (0 in a scalar conext).
       
  1147 
       
  1148 If the hunk contains replacements, then both C<< $diff->Items(1) >> and
       
  1149 C<< $diff->Items(2) >> will return different, non-empty lists.
       
  1150 
       
  1151 Otherwise, the hunk contains identical items and all of the following
       
  1152 will return the same lists:
       
  1153 
       
  1154     @items = $diff->Items(1);
       
  1155     @items = $diff->Items(2);
       
  1156     @items = $diff->Same();
       
  1157 
       
  1158 =item C<Range>
       
  1159 
       
  1160     $count = $diff->Range( $seqNum );
       
  1161     @indices = $diff->Range( $seqNum );
       
  1162     @indices = $diff->Range( $seqNum, $base );
       
  1163 
       
  1164 C<Range> is like C<Items> except that it returns a list of I<indices> to
       
  1165 the items rather than the items themselves.  By default, the index of
       
  1166 the first item (in each sequence) is 0 but this can be changed by
       
  1167 calling the C<Base> method.  So, by default, the following two snippets
       
  1168 return the same lists:
       
  1169 
       
  1170     @list = $diff->Items(2);
       
  1171     @list = @seq2[ $diff->Range(2) ];
       
  1172 
       
  1173 You can also specify the base to use as the second argument.  So the
       
  1174 following two snippets I<always> return the same lists:
       
  1175 
       
  1176     @list = $diff->Items(1);
       
  1177     @list = @seq1[ $diff->Range(1,0) ];
       
  1178 
       
  1179 =item C<Base>
       
  1180 
       
  1181     $curBase = $diff->Base();
       
  1182     $oldBase = $diff->Base($newBase);
       
  1183 
       
  1184 C<Base> sets and/or returns the current base (usually 0 or 1) that is
       
  1185 used when you request range information.  The base defaults to 0 so
       
  1186 that range information is returned as array indices.  You can set the
       
  1187 base to 1 if you want to report traditional line numbers instead.
       
  1188 
       
  1189 =item C<Min>
       
  1190 
       
  1191     $min1 = $diff->Min(1);
       
  1192     $min = $diff->Min( $seqNum, $base );
       
  1193 
       
  1194 C<Min> returns the first value that C<Range> would return (given the
       
  1195 same arguments) or returns C<undef> if C<Range> would return an empty
       
  1196 list.
       
  1197 
       
  1198 =item C<Max>
       
  1199 
       
  1200 C<Max> returns the last value that C<Range> would return or C<undef>.
       
  1201 
       
  1202 =item C<Get>
       
  1203 
       
  1204     ( $n, $x, $r ) = $diff->Get(qw( min1 max1 range1 ));
       
  1205     @values = $diff->Get(qw( 0min2 1max2 range2 same base ));
       
  1206 
       
  1207 C<Get> returns one or more scalar values.  You pass in a list of the
       
  1208 names of the values you want returned.  Each name must match one of the
       
  1209 following regexes:
       
  1210 
       
  1211     /^(-?\d+)?(min|max)[12]$/i
       
  1212     /^(range[12]|same|diff|base)$/i
       
  1213 
       
  1214 The 1 or 2 after a name says which sequence you want the information
       
  1215 for (and where allowed, it is required).  The optional number before
       
  1216 "min" or "max" is the base to use.  So the following equalities hold:
       
  1217 
       
  1218     $diff->Get('min1') == $diff->Min(1)
       
  1219     $diff->Get('0min2') == $diff->Min(2,0)
       
  1220 
       
  1221 Using C<Get> in a scalar context when you've passed in more than one
       
  1222 name is a fatal error (C<die> is called).
       
  1223 
       
  1224 =back
       
  1225 
       
  1226 =head2 C<prepare>
       
  1227 
       
  1228 Given a reference to a list of items, C<prepare> returns a reference
       
  1229 to a hash which can be used when comparing this sequence to other
       
  1230 sequences with C<LCS> or C<LCS_length>.
       
  1231 
       
  1232     $prep = prepare( \@seq1 );
       
  1233     for $i ( 0 .. 10_000 )
       
  1234     {
       
  1235         @lcs = LCS( $prep, $seq[$i] );
       
  1236         # do something useful with @lcs
       
  1237     }
       
  1238 
       
  1239 C<prepare> may be passed an optional third parameter; this is a CODE
       
  1240 reference to a key generation function.  See L</KEY GENERATION
       
  1241 FUNCTIONS>.
       
  1242 
       
  1243     $prep = prepare( \@seq1, \&keyGen );
       
  1244     for $i ( 0 .. 10_000 )
       
  1245     {
       
  1246         @lcs = LCS( $seq[$i], $prep, \&keyGen );
       
  1247         # do something useful with @lcs
       
  1248     }
       
  1249 
       
  1250 Using C<prepare> provides a performance gain of about 50% when calling LCS
       
  1251 many times compared with not preparing.
       
  1252 
       
  1253 =head2 C<diff>
       
  1254 
       
  1255     @diffs     = diff( \@seq1, \@seq2 );
       
  1256     $diffs_ref = diff( \@seq1, \@seq2 );
       
  1257 
       
  1258 C<diff> computes the smallest set of additions and deletions necessary
       
  1259 to turn the first sequence into the second, and returns a description
       
  1260 of these changes.  The description is a list of I<hunks>; each hunk
       
  1261 represents a contiguous section of items which should be added,
       
  1262 deleted, or replaced.  (Hunks containing unchanged items are not
       
  1263 included.)
       
  1264 
       
  1265 The return value of C<diff> is a list of hunks, or, in scalar context, a
       
  1266 reference to such a list.  If there are no differences, the list will be
       
  1267 empty.
       
  1268 
       
  1269 Here is an example.  Calling C<diff> for the following two sequences:
       
  1270 
       
  1271     a b c e h j l m n p
       
  1272     b c d e f j k l m r s t
       
  1273 
       
  1274 would produce the following list:
       
  1275 
       
  1276     (
       
  1277       [ [ '-', 0, 'a' ] ],
       
  1278 
       
  1279       [ [ '+', 2, 'd' ] ],
       
  1280 
       
  1281       [ [ '-', 4, 'h' ],
       
  1282         [ '+', 4, 'f' ] ],
       
  1283 
       
  1284       [ [ '+', 6, 'k' ] ],
       
  1285 
       
  1286       [ [ '-',  8, 'n' ],
       
  1287         [ '-',  9, 'p' ],
       
  1288         [ '+',  9, 'r' ],
       
  1289         [ '+', 10, 's' ],
       
  1290         [ '+', 11, 't' ] ],
       
  1291     )
       
  1292 
       
  1293 There are five hunks here.  The first hunk says that the C<a> at
       
  1294 position 0 of the first sequence should be deleted (C<->).  The second
       
  1295 hunk says that the C<d> at position 2 of the second sequence should
       
  1296 be inserted (C<+>).  The third hunk says that the C<h> at position 4
       
  1297 of the first sequence should be removed and replaced with the C<f>
       
  1298 from position 4 of the second sequence.  And so on.
       
  1299 
       
  1300 C<diff> may be passed an optional third parameter; this is a CODE
       
  1301 reference to a key generation function.  See L</KEY GENERATION
       
  1302 FUNCTIONS>.
       
  1303 
       
  1304 Additional parameters, if any, will be passed to the key generation
       
  1305 routine.
       
  1306 
       
  1307 =head2 C<sdiff>
       
  1308 
       
  1309     @sdiffs     = sdiff( \@seq1, \@seq2 );
       
  1310     $sdiffs_ref = sdiff( \@seq1, \@seq2 );
       
  1311 
       
  1312 C<sdiff> computes all necessary components to show two sequences
       
  1313 and their minimized differences side by side, just like the
       
  1314 Unix-utility I<sdiff> does:
       
  1315 
       
  1316     same             same
       
  1317     before     |     after
       
  1318     old        <     -
       
  1319     -          >     new
       
  1320 
       
  1321 It returns a list of array refs, each pointing to an array of
       
  1322 display instructions. In scalar context it returns a reference
       
  1323 to such a list. If there are no differences, the list will have one
       
  1324 entry per item, each indicating that the item was unchanged.
       
  1325 
       
  1326 Display instructions consist of three elements: A modifier indicator
       
  1327 (C<+>: Element added, C<->: Element removed, C<u>: Element unmodified,
       
  1328 C<c>: Element changed) and the value of the old and new elements, to
       
  1329 be displayed side-by-side.
       
  1330 
       
  1331 An C<sdiff> of the following two sequences:
       
  1332 
       
  1333     a b c e h j l m n p
       
  1334     b c d e f j k l m r s t
       
  1335 
       
  1336 results in
       
  1337 
       
  1338     ( [ '-', 'a', ''  ],
       
  1339       [ 'u', 'b', 'b' ],
       
  1340       [ 'u', 'c', 'c' ],
       
  1341       [ '+', '',  'd' ],
       
  1342       [ 'u', 'e', 'e' ],
       
  1343       [ 'c', 'h', 'f' ],
       
  1344       [ 'u', 'j', 'j' ],
       
  1345       [ '+', '',  'k' ],
       
  1346       [ 'u', 'l', 'l' ],
       
  1347       [ 'u', 'm', 'm' ],
       
  1348       [ 'c', 'n', 'r' ],
       
  1349       [ 'c', 'p', 's' ],
       
  1350       [ '+', '',  't' ],
       
  1351     )
       
  1352 
       
  1353 C<sdiff> may be passed an optional third parameter; this is a CODE
       
  1354 reference to a key generation function.  See L</KEY GENERATION
       
  1355 FUNCTIONS>.
       
  1356 
       
  1357 Additional parameters, if any, will be passed to the key generation
       
  1358 routine.
       
  1359 
       
  1360 =head2 C<compact_diff>
       
  1361 
       
  1362 C<compact_diff> is much like C<sdiff> except it returns a much more
       
  1363 compact description consisting of just one flat list of indices.  An
       
  1364 example helps explain the format:
       
  1365 
       
  1366     my @a = qw( a b c   e  h j   l m n p      );
       
  1367     my @b = qw(   b c d e f  j k l m    r s t );
       
  1368     @cdiff = compact_diff( \@a, \@b );
       
  1369     # Returns:
       
  1370     #   @a      @b       @a       @b
       
  1371     #  start   start   values   values
       
  1372     (    0,      0,   #       =
       
  1373          0,      0,   #    a  !
       
  1374          1,      0,   #  b c  =  b c
       
  1375          3,      2,   #       !  d
       
  1376          3,      3,   #    e  =  e
       
  1377          4,      4,   #    f  !  h
       
  1378          5,      5,   #    j  =  j
       
  1379          6,      6,   #       !  k
       
  1380          6,      7,   #  l m  =  l m
       
  1381          8,      9,   #  n p  !  r s t
       
  1382         10,     12,   #
       
  1383     );
       
  1384 
       
  1385 The 0th, 2nd, 4th, etc. entries are all indices into @seq1 (@a in the
       
  1386 above example) indicating where a hunk begins.  The 1st, 3rd, 5th, etc.
       
  1387 entries are all indices into @seq2 (@b in the above example) indicating
       
  1388 where the same hunk begins.
       
  1389 
       
  1390 So each pair of indices (except the last pair) describes where a hunk
       
  1391 begins (in each sequence).  Since each hunk must end at the item just
       
  1392 before the item that starts the next hunk, the next pair of indices can
       
  1393 be used to determine where the hunk ends.
       
  1394 
       
  1395 So, the first 4 entries (0..3) describe the first hunk.  Entries 0 and 1
       
  1396 describe where the first hunk begins (and so are always both 0).
       
  1397 Entries 2 and 3 describe where the next hunk begins, so subtracting 1
       
  1398 from each tells us where the first hunk ends.  That is, the first hunk
       
  1399 contains items C<$diff[0]> through C<$diff[2] - 1> of the first sequence
       
  1400 and contains items C<$diff[1]> through C<$diff[3] - 1> of the second
       
  1401 sequence.
       
  1402 
       
  1403 In other words, the first hunk consists of the following two lists of items:
       
  1404 
       
  1405                #  1st pair     2nd pair
       
  1406                # of indices   of indices
       
  1407     @list1 = @a[ $cdiff[0] .. $cdiff[2]-1 ];
       
  1408     @list2 = @b[ $cdiff[1] .. $cdiff[3]-1 ];
       
  1409                # Hunk start   Hunk end
       
  1410 
       
  1411 Note that the hunks will always alternate between those that are part of
       
  1412 the LCS (those that contain unchanged items) and those that contain
       
  1413 changes.  This means that all we need to be told is whether the first
       
  1414 hunk is a 'same' or 'diff' hunk and we can determine which of the other
       
  1415 hunks contain 'same' items or 'diff' items.
       
  1416 
       
  1417 By convention, we always make the first hunk contain unchanged items.
       
  1418 So the 1st, 3rd, 5th, etc. hunks (all odd-numbered hunks if you start
       
  1419 counting from 1) all contain unchanged items.  And the 2nd, 4th, 6th,
       
  1420 etc. hunks (all even-numbered hunks if you start counting from 1) all
       
  1421 contain changed items.
       
  1422 
       
  1423 Since @a and @b don't begin with the same value, the first hunk in our
       
  1424 example is empty (otherwise we'd violate the above convention).  Note
       
  1425 that the first 4 index values in our example are all zero.  Plug these
       
  1426 values into our previous code block and we get:
       
  1427 
       
  1428     @hunk1a = @a[ 0 .. 0-1 ];
       
  1429     @hunk1b = @b[ 0 .. 0-1 ];
       
  1430 
       
  1431 And C<0..-1> returns the empty list.
       
  1432 
       
  1433 Move down one pair of indices (2..5) and we get the offset ranges for
       
  1434 the second hunk, which contains changed items.
       
  1435 
       
  1436 Since C<@diff[2..5]> contains (0,0,1,0) in our example, the second hunk
       
  1437 consists of these two lists of items:
       
  1438 
       
  1439         @hunk2a = @a[ $cdiff[2] .. $cdiff[4]-1 ];
       
  1440         @hunk2b = @b[ $cdiff[3] .. $cdiff[5]-1 ];
       
  1441     # or
       
  1442         @hunk2a = @a[ 0 .. 1-1 ];
       
  1443         @hunk2b = @b[ 0 .. 0-1 ];
       
  1444     # or
       
  1445         @hunk2a = @a[ 0 .. 0 ];
       
  1446         @hunk2b = @b[ 0 .. -1 ];
       
  1447     # or
       
  1448         @hunk2a = ( 'a' );
       
  1449         @hunk2b = ( );
       
  1450 
       
  1451 That is, we would delete item 0 ('a') from @a.
       
  1452 
       
  1453 Since C<@diff[4..7]> contains (1,0,3,2) in our example, the third hunk
       
  1454 consists of these two lists of items:
       
  1455 
       
  1456         @hunk3a = @a[ $cdiff[4] .. $cdiff[6]-1 ];
       
  1457         @hunk3a = @b[ $cdiff[5] .. $cdiff[7]-1 ];
       
  1458     # or
       
  1459         @hunk3a = @a[ 1 .. 3-1 ];
       
  1460         @hunk3a = @b[ 0 .. 2-1 ];
       
  1461     # or
       
  1462         @hunk3a = @a[ 1 .. 2 ];
       
  1463         @hunk3a = @b[ 0 .. 1 ];
       
  1464     # or
       
  1465         @hunk3a = qw( b c );
       
  1466         @hunk3a = qw( b c );
       
  1467 
       
  1468 Note that this third hunk contains unchanged items as our convention demands.
       
  1469 
       
  1470 You can continue this process until you reach the last two indices,
       
  1471 which will always be the number of items in each sequence.  This is
       
  1472 required so that subtracting one from each will give you the indices to
       
  1473 the last items in each sequence.
       
  1474 
       
  1475 =head2 C<traverse_sequences>
       
  1476 
       
  1477 C<traverse_sequences> used to be the most general facility provided by
       
  1478 this module (the new OO interface is more powerful and much easier to
       
  1479 use).
       
  1480 
       
  1481 Imagine that there are two arrows.  Arrow A points to an element of
       
  1482 sequence A, and arrow B points to an element of the sequence B. 
       
  1483 Initially, the arrows point to the first elements of the respective
       
  1484 sequences.  C<traverse_sequences> will advance the arrows through the
       
  1485 sequences one element at a time, calling an appropriate user-specified
       
  1486 callback function before each advance.  It willadvance the arrows in
       
  1487 such a way that if there are equal elements C<$A[$i]> and C<$B[$j]>
       
  1488 which are equal and which are part of the LCS, there will be some moment
       
  1489 during the execution of C<traverse_sequences> when arrow A is pointing
       
  1490 to C<$A[$i]> and arrow B is pointing to C<$B[$j]>.  When this happens,
       
  1491 C<traverse_sequences> will call the C<MATCH> callback function and then
       
  1492 it will advance both arrows.
       
  1493 
       
  1494 Otherwise, one of the arrows is pointing to an element of its sequence
       
  1495 that is not part of the LCS.  C<traverse_sequences> will advance that
       
  1496 arrow and will call the C<DISCARD_A> or the C<DISCARD_B> callback,
       
  1497 depending on which arrow it advanced.  If both arrows point to elements
       
  1498 that are not part of the LCS, then C<traverse_sequences> will advance
       
  1499 one of them and call the appropriate callback, but it is not specified
       
  1500 which it will call.
       
  1501 
       
  1502 The arguments to C<traverse_sequences> are the two sequences to
       
  1503 traverse, and a hash which specifies the callback functions, like this:
       
  1504 
       
  1505     traverse_sequences(
       
  1506         \@seq1, \@seq2,
       
  1507         {   MATCH => $callback_1,
       
  1508             DISCARD_A => $callback_2,
       
  1509             DISCARD_B => $callback_3,
       
  1510         }
       
  1511     );
       
  1512 
       
  1513 Callbacks for MATCH, DISCARD_A, and DISCARD_B are invoked with at least
       
  1514 the indices of the two arrows as their arguments.  They are not expected
       
  1515 to return any values.  If a callback is omitted from the table, it is
       
  1516 not called.
       
  1517 
       
  1518 Callbacks for A_FINISHED and B_FINISHED are invoked with at least the
       
  1519 corresponding index in A or B.
       
  1520 
       
  1521 If arrow A reaches the end of its sequence, before arrow B does,
       
  1522 C<traverse_sequences> will call the C<A_FINISHED> callback when it
       
  1523 advances arrow B, if there is such a function; if not it will call
       
  1524 C<DISCARD_B> instead.  Similarly if arrow B finishes first. 
       
  1525 C<traverse_sequences> returns when both arrows are at the ends of their
       
  1526 respective sequences.  It returns true on success and false on failure. 
       
  1527 At present there is no way to fail.
       
  1528 
       
  1529 C<traverse_sequences> may be passed an optional fourth parameter; this
       
  1530 is a CODE reference to a key generation function.  See L</KEY GENERATION
       
  1531 FUNCTIONS>.
       
  1532 
       
  1533 Additional parameters, if any, will be passed to the key generation function.
       
  1534 
       
  1535 If you want to pass additional parameters to your callbacks, but don't
       
  1536 need a custom key generation function, you can get the default by
       
  1537 passing undef:
       
  1538 
       
  1539     traverse_sequences(
       
  1540         \@seq1, \@seq2,
       
  1541         {   MATCH => $callback_1,
       
  1542             DISCARD_A => $callback_2,
       
  1543             DISCARD_B => $callback_3,
       
  1544         },
       
  1545         undef,     # default key-gen
       
  1546         $myArgument1,
       
  1547         $myArgument2,
       
  1548         $myArgument3,
       
  1549     );
       
  1550 
       
  1551 C<traverse_sequences> does not have a useful return value; you are
       
  1552 expected to plug in the appropriate behavior with the callback
       
  1553 functions.
       
  1554 
       
  1555 =head2 C<traverse_balanced>
       
  1556 
       
  1557 C<traverse_balanced> is an alternative to C<traverse_sequences>. It
       
  1558 uses a different algorithm to iterate through the entries in the
       
  1559 computed LCS. Instead of sticking to one side and showing element changes
       
  1560 as insertions and deletions only, it will jump back and forth between
       
  1561 the two sequences and report I<changes> occurring as deletions on one
       
  1562 side followed immediatly by an insertion on the other side.
       
  1563 
       
  1564 In addition to the C<DISCARD_A>, C<DISCARD_B>, and C<MATCH> callbacks
       
  1565 supported by C<traverse_sequences>, C<traverse_balanced> supports
       
  1566 a C<CHANGE> callback indicating that one element got C<replaced> by another:
       
  1567 
       
  1568     traverse_balanced(
       
  1569         \@seq1, \@seq2,
       
  1570         {   MATCH => $callback_1,
       
  1571             DISCARD_A => $callback_2,
       
  1572             DISCARD_B => $callback_3,
       
  1573             CHANGE    => $callback_4,
       
  1574         }
       
  1575     );
       
  1576 
       
  1577 If no C<CHANGE> callback is specified, C<traverse_balanced>
       
  1578 will map C<CHANGE> events to C<DISCARD_A> and C<DISCARD_B> actions,
       
  1579 therefore resulting in a similar behaviour as C<traverse_sequences>
       
  1580 with different order of events.
       
  1581 
       
  1582 C<traverse_balanced> might be a bit slower than C<traverse_sequences>,
       
  1583 noticable only while processing huge amounts of data.
       
  1584 
       
  1585 The C<sdiff> function of this module 
       
  1586 is implemented as call to C<traverse_balanced>.
       
  1587 
       
  1588 C<traverse_balanced> does not have a useful return value; you are expected to
       
  1589 plug in the appropriate behavior with the callback functions.
       
  1590 
       
  1591 =head1 KEY GENERATION FUNCTIONS
       
  1592 
       
  1593 Most of the functions accept an optional extra parameter.  This is a
       
  1594 CODE reference to a key generating (hashing) function that should return
       
  1595 a string that uniquely identifies a given element.  It should be the
       
  1596 case that if two elements are to be considered equal, their keys should
       
  1597 be the same (and the other way around).  If no key generation function
       
  1598 is provided, the key will be the element as a string.
       
  1599 
       
  1600 By default, comparisons will use "eq" and elements will be turned into keys
       
  1601 using the default stringizing operator '""'.
       
  1602 
       
  1603 Where this is important is when you're comparing something other than
       
  1604 strings.  If it is the case that you have multiple different objects
       
  1605 that should be considered to be equal, you should supply a key
       
  1606 generation function. Otherwise, you have to make sure that your arrays
       
  1607 contain unique references.
       
  1608 
       
  1609 For instance, consider this example:
       
  1610 
       
  1611     package Person;
       
  1612 
       
  1613     sub new
       
  1614     {
       
  1615         my $package = shift;
       
  1616         return bless { name => '', ssn => '', @_ }, $package;
       
  1617     }
       
  1618 
       
  1619     sub clone
       
  1620     {
       
  1621         my $old = shift;
       
  1622         my $new = bless { %$old }, ref($old);
       
  1623     }
       
  1624 
       
  1625     sub hash
       
  1626     {
       
  1627         return shift()->{'ssn'};
       
  1628     }
       
  1629 
       
  1630     my $person1 = Person->new( name => 'Joe', ssn => '123-45-6789' );
       
  1631     my $person2 = Person->new( name => 'Mary', ssn => '123-47-0000' );
       
  1632     my $person3 = Person->new( name => 'Pete', ssn => '999-45-2222' );
       
  1633     my $person4 = Person->new( name => 'Peggy', ssn => '123-45-9999' );
       
  1634     my $person5 = Person->new( name => 'Frank', ssn => '000-45-9999' );
       
  1635 
       
  1636 If you did this:
       
  1637 
       
  1638     my $array1 = [ $person1, $person2, $person4 ];
       
  1639     my $array2 = [ $person1, $person3, $person4, $person5 ];
       
  1640     Algorithm::Diff::diff( $array1, $array2 );
       
  1641 
       
  1642 everything would work out OK (each of the objects would be converted
       
  1643 into a string like "Person=HASH(0x82425b0)" for comparison).
       
  1644 
       
  1645 But if you did this:
       
  1646 
       
  1647     my $array1 = [ $person1, $person2, $person4 ];
       
  1648     my $array2 = [ $person1, $person3, $person4->clone(), $person5 ];
       
  1649     Algorithm::Diff::diff( $array1, $array2 );
       
  1650 
       
  1651 $person4 and $person4->clone() (which have the same name and SSN)
       
  1652 would be seen as different objects. If you wanted them to be considered
       
  1653 equivalent, you would have to pass in a key generation function:
       
  1654 
       
  1655     my $array1 = [ $person1, $person2, $person4 ];
       
  1656     my $array2 = [ $person1, $person3, $person4->clone(), $person5 ];
       
  1657     Algorithm::Diff::diff( $array1, $array2, \&Person::hash );
       
  1658 
       
  1659 This would use the 'ssn' field in each Person as a comparison key, and
       
  1660 so would consider $person4 and $person4->clone() as equal.
       
  1661 
       
  1662 You may also pass additional parameters to the key generation function
       
  1663 if you wish.
       
  1664 
       
  1665 =head1 ERROR CHECKING
       
  1666 
       
  1667 If you pass these routines a non-reference and they expect a reference,
       
  1668 they will die with a message.
       
  1669 
       
  1670 =head1 AUTHOR
       
  1671 
       
  1672 This version released by Tye McQueen (http://perlmonks.org/?node=tye).
       
  1673 
       
  1674 =head1 LICENSE
       
  1675 
       
  1676 Parts Copyright (c) 2000-2004 Ned Konz.  All rights reserved.
       
  1677 Parts by Tye McQueen.
       
  1678 
       
  1679 This program is free software; you can redistribute it and/or modify it
       
  1680 under the same terms as Perl.
       
  1681 
       
  1682 =head1 MAILING LIST
       
  1683 
       
  1684 Mark-Jason still maintains a mailing list.  To join a low-volume mailing
       
  1685 list for announcements related to diff and Algorithm::Diff, send an
       
  1686 empty mail message to mjd-perl-diff-request@plover.com.
       
  1687 
       
  1688 =head1 CREDITS
       
  1689 
       
  1690 Versions through 0.59 (and much of this documentation) were written by:
       
  1691 
       
  1692 Mark-Jason Dominus, mjd-perl-diff@plover.com
       
  1693 
       
  1694 This version borrows some documentation and routine names from
       
  1695 Mark-Jason's, but Diff.pm's code was completely replaced.
       
  1696 
       
  1697 This code was adapted from the Smalltalk code of Mario Wolczko
       
  1698 <mario@wolczko.com>, which is available at
       
  1699 ftp://st.cs.uiuc.edu/pub/Smalltalk/MANCHESTER/manchester/4.0/diff.st
       
  1700 
       
  1701 C<sdiff> and C<traverse_balanced> were written by Mike Schilli
       
  1702 <m@perlmeister.com>.
       
  1703 
       
  1704 The algorithm is that described in
       
  1705 I<A Fast Algorithm for Computing Longest Common Subsequences>,
       
  1706 CACM, vol.20, no.5, pp.350-353, May 1977, with a few
       
  1707 minor improvements to improve the speed.
       
  1708 
       
  1709 Much work was done by Ned Konz (perl@bike-nomad.com).
       
  1710 
       
  1711 The OO interface and some other changes are by Tye McQueen.
       
  1712 
       
  1713 =cut