package task_hash; use strict; use warnings; use Digest::MD5 qw(md5_hex); my @keywords = qw( auto break case char const continue default do double else enum extern float for goto if int long register return short signed sizeof static struct switch typedef union unsigned void volatile while printf scanf include stdio math stdlib malloc calloc realloc free bool catch class const_cast delete dynamic_cast explicit false friend inline mutable namespace new operator private protected public reinterpret_cast static_cast template this throw true try typeid typename using virtual wchar_t cons car cdr atom null list defun lambda setq eval funcall apply format princ if cond unless eq eql equal equalp let loop nil as case of class data default deriving do forall foreign hiding if then else import infix infixl infixr instance let in mdo module newtype qualified type where ); # Usage: $hashref = hash($sourcecode); sub hash($) { my ($contents) = @_; my %keywords; map { $keywords{$_} = 1; } @keywords; my %result; $result{source} = md5_hex($contents); $contents =~ s/\/\*.*?\*\///gs; # I know that's not exactly correct $contents =~ s/\/\/.*//g; $contents =~ s/\b/ /g; $contents =~ s/(\W)(?=\W)/$1 /g; $contents =~ s/[\s\n\r]+/ /g; $result{compressed} = md5_hex($contents); my %dictionary; my $counter = 0; while ($contents =~ /\b([a-z]\w*)\b/gi) { next if defined $keywords{$1}; $dictionary{$1} = sprintf "ID%05d", ++$counter; } foreach my $word (keys %dictionary) { $contents =~ s/\b$word\b/$dictionary{$word}/g; } $result{encoded} = md5_hex($contents); my @contents = split /\s+/, $contents; my $fingerprints = ''; my @fplist; foreach my $token (@contents) { my $value = (defined $keywords{$token} ? 'k' : ($token =~ /^ID/ ? 'i' : ($token =~ /^\d+$/ ? 'n' : $token))); $fingerprints .= $value; push @fplist, $value; } $result{fingerprints} = $fingerprints; $result{fplist} = \@fplist; return \%result; } 1;