#!/usr/bin/env perl use Getopt::Std; use warnings; use strict; # flush output linebuffered $|=1; # ------------------------------------------------------------------- # globals # ------------------------------------------------------------------- our %args; our $platform; our @rules=(); our $taggercommand; our $filename = "/tmp/input.$$"; our $mapfile; our $threshold; our @input; our @output; # ------------------------------------------------------------------- # read map # ------------------------------------------------------------------- # read mapping rules sub readmap{ open(MAP, "< $mapfile") or die "Couldn't open map $mapfile for reading: $!\n"; while(my $l=){ my %rtps; chomp($l); #continue if line is empty next if $l=~ m/^\s*$/; #filter comments next if $l=~ m/^\#/; # parse rule $l=~ m/^\s*(\S*\s*\S+)\s*->\s*(.+)/; my $le=$1; my $ri=$2; # whylikethis??? # parse left side my $w=""; my $t; $le =~m/((\S+)\s+)?(\S+)/; $w=$2; $t=$3; # parse right side if(!$threshold){ $ri=~ m/(\S+)\s*/; $rtps{$1}=1; }else{ while($ri=~ m/(\S+)(\s+)?((\d+(.\d+)?(e[\-\+]\d*)?)?)?/g){ if($3){ $rtps{$1}=$3; } else{ $rtps{$1}=1.0; } } } my %r=(word=>$w, tag=>$t, rtps=>\%rtps); push @rules,\%r; #&printrule(\%r); } close(MAP); } # ------------------------------------------------------------------- # printrule # ------------------------------------------------------------------- sub printrule(){ my $r=$_[0]; if($r->{word}){ printf "rule: %s %s -> ",$r->{word}, $r->{tag}; }else{ printf "rule: %s -> ", $r->{tag}; } my $rtps=$r->{rtps}; foreach my $rtp(keys %$rtps){ printf"%s %e ", $rtp, $rtps->{$rtp}; } print "\n"; } # ------------------------------------------------------------------- # chop - this function parses stdin # ------------------------------------------------------------------- sub chop{ my $rest; my $l=$_[0]; my $w=""; my $t=""; my $p=0; my %tps; if($l=~m/(\S+)\s+(.+)/){ $w=$1; $rest=$2; while($rest=~m/(\S+)(\s+)?(\d+.\d+e[\-\+]\d*)?/g){ $t=$1; $p=($3) ? $3 : 1; # set p to $3 if $3 is defined, else set p to 1 $tps{$t}=$p; } #print "chop $w "; &printtagprobs(\@tps); my %r=(word => $w, tagprobs => \%tps); push @input, \%r; #&printline(\%r); } } # ------------------------------------------------------------------- # printline # ------------------------------------------------------------------- sub printline(){ my $lr=$_[0]; printf "%s ", $lr->{word}; my $tps = $lr->{tagprobs}; if($threshold){ foreach my $tp(keys %$tps){ printf"%s %e ", $tp, $tps->{$tp}; } } else{ foreach my $tp(keys %$tps){ printf"%s", $tp; } } print "\n"; } # ------------------------------------------------------------------- # printtagprobs # ------------------------------------------------------------------- sub printtagprobs(){ my $tps=$_[0]; foreach my $tp(keys %$tps){ printf"%s %e ", $tp, $tps->{$tp}; } print "\n"; } # ------------------------------------------------------------------- # mergehashes # ------------------------------------------------------------------- sub mergehashes{ my $a=$_[0]; my $b=$_[1]; my $c=$_[2]; foreach my $kv ( $a, $b ) { while ((my $k, my $v) = each %$kv) { if (exists $c->{$k}) { #printf "key exists, adding values\n \$v %f \$c{%s} %f\n", $v, $k, $c->{$k}; $c->{$k}+=$v; } else { $c->{$k}=$v; } } } } # ------------------------------------------------------------------- # map # ------------------------------------------------------------------- sub map { my $l; foreach $l(@input){ my $tps = $l->{tagprobs}; my %otps; my $tp; #tp: a tag probabiltity pair from current line foreach $tp (keys %$tps){ # must be a list of hashes because a single pair of tag and prob # might be replaced by a list of pairs my %otp; foreach my $rl(@rules){ # for(my $i=0; $i<=$#rules; $i++){ # my $rl=$rules[$i]; #printf STDERR "trying\n";&printrule($rules[$i]);print"in ";&printline($l); # check if the rule specified word matches the word if specified if(!$rl->{word} or ($rl->{word} eq $l->{word})){ #print "DB: pass wordtest ruletag $rl->{tag} linetag $tp\n"; if($rl->{tag} eq $tp){ #print "DB: pass tagtest\n"; # multiply rule probabilities with tnt probabilities foreach my $k(keys %{$rl->{rtps}}){ $otp{$k}=$tps->{$tp} * $rl->{rtps}->{$k}; } # to avoid prefix problems (one rule could result in a valid prefix for # another), jump to next tag-prob pair last; } } } # add current tag-probability pair to output tag probabilities if no rule matched my @kotp=keys %otp; if($#kotp==-1){ $otp{$tp}=$tps->{$tp} } # merge %otp (tp-pairs resulting from this rule) and %otps (tp-pairs resulting # from previous rules) and add probabilities if a key (tag) exists in both my %result; &mergehashes(\%otp, \%otps, \%result); %otps=%result; #print STDERR "pushed "; &printtagprobs(\%otp); } # -create @output from tp-pairs # -add elements of @rules[$i]->rtps push(@output, {word=> $l->{word}, tagprobs=>\%otps}); &printline($output[$#output]); } # join double tp-pairs in @input->{tagprob} normalize corresponding probabilities # print @ input to stdout # printf STDOUT "%s\n", $l; } # ------------------------------------------------------------------- # open_file # ------------------------------------------------------------------- sub open_file() { unlink($filename); open(FILE,">$filename") or die "Can't write to temp file!\n"; } # ------------------------------------------------------------------- # main # ------------------------------------------------------------------- getopts("l:z:m:c", \%args); $platform = `uname`; chomp($platform); # command to invoke the tagger $threshold = (defined $args{z} && $args{z}>=0) ? "-z$args{z} " : ""; my $tagger = $platform eq "SunOS" && -x "/opt/pkg/tnt/tnt" ? "/opt/pkg/tnt/tnt" : $platform eq "Darwin" ? "/Users/jochen/Documents/diplom/Tools/tnt-fake.pl" : $platform eq "Linux" && -x "/data/linux/opt/tnt/tnt" ? "/data/linux/opt/tnt/tnt" : "tnt"; die "no model specified\n" if !defined $args{l}; die "no map specified\n" if !defined $args{m}; my $model = $args{l}; $mapfile = $args{m}; $taggercommand.=$tagger; $taggercommand.=" -v0 $threshold"; $taggercommand.=" $model"; $taggercommand.=" $filename"; &readmap; open_file(); # this while loop first writes to file FILE # then runs the tagger on the file. the inner loop # processes the output of the tagger. open (M, ">map.log"); while (<>) { print FILE; # if STDIN reaches an empty line and option c is undefined, # close FILE and tag it. otherwise wait until STDIN is completely # copied to FILE before running the tagger on FILE if((/^$/ && !(defined $args{c})) or (eof)) { close(FILE); # open forks and then execs the taggercommand from the shell. # the tagger process results which receives the same STDIN # we need not to worry about controlling it's input, because # we don't do anything with it. the pipe symbol links its # STDOUT with our STDIN open(T, "$taggercommand |") or die "couldn't open the tagger ($taggercommand)\n"; # read STDOUT of the Tagger and map while (my $l=) { &chop($l); } close(T); open_file(); next; } } unlink($filename); ↦ # for (my $i=0; $i<=$#input; $i++){ # #&printline($input[$i]); # &printline($output[$i]); #} close(M);