#!/usr/bin/perl -w use strict; use Data::Dumper; my $K = 10; my $PRE = 3; my $POST = 0; my $ITER = 30; my @text = (); my @chopped = (); while (<>) { push @chopped, split /([a-zA-Z']+)/; tr/A-Z/a-z/; s/[?!:;,]/./g; s/[^a-z'.]/ /g; s/\./ . /g; s/\s+/ /g; s/'//g; push @text, split; } my %count = (); for (@text) { $count{$_}++; } my @words = sort keys %count; sub iter { my ($vec) = @_; # compute counts my $count = {}; for my $i ($PRE..$#text - $POST) { my $pre = join (",", map {$vec->{$text[$i-$_]}} (1..$PRE)); my $post = join (",", map {$vec->{$text[$i+$_]}} (1..$POST)); my $this = $vec->{$text[$i]}; $count->{"$pre;$post"}{$this}++; } # count total label frequencies for regularization my $tf = {}; for my $x (keys %$count) { for my $k (keys %{$count->{$x}}) { $tf->{$k} += $count->{$x}{$k}; } } # normalize to regularized "log probability distribution" my $param = {}; for my $x (keys %$count) { for my $k (keys %{$count->{$x}}) { $param->{$x}{$k} = log ((1 + $count->{$x}{$k}) / (1 + $tf->{$k})); } } # find "likelihoods" my $prob = {}; for my $i ($PRE..$#text - $POST) { my $pre = join (",", map {$vec->{$text[$i-$_]}} (1..$PRE)); my $post = join (",", map {$vec->{$text[$i+$_]}} (1..$POST)); my $probs = $param->{"$pre;$post"}; for my $k (keys %$probs) { $prob->{$text[$i]}{$k} += $probs->{$k}; } } # compute "max-likelihood" labelling my $newvec = {}; for (keys %$prob) { $newvec->{$_} = argmin($prob->{$_}); } return $newvec; } sub argmin { my ($hash) = @_; my ($minv, $mink) = (1e9, ""); for (keys %$hash) { if ($hash->{$_} < $minv) { $mink = $_; $minv = $hash->{$_}; } } return $mink; } # for (sort {$count{$b} <=> $count{$a}} @words) { # print "$_: $count{$_}\n"; # } my $vec = {map {($_ => int(rand($K)))} @words}; for (0..$ITER-1) { print STDERR "$_...\n"; $vec = iter($vec); } print <<'EOF'; EOF for (@chopped) { s/\n/
\n/g; my $key = $_; $key =~ tr/A-Z/a-z/; $key =~ s/'//g; if (defined $vec->{$key}) { print "$_"; } else { print "$_"; } } print "";