#!/usr/bin/perl # Copyright (c) 2014, University of Southern California # All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. use strict; $| = 1; my $TRAIN = 0; my $NUMTR = 0; my %wdep = (); my @actarr = (); # if the -t option is used, do training if( $ARGV[0] eq "-t" ) { $TRAIN = 1; shift; $NUMTR = shift; } my $useclusters = 0; my %clusters = (); if( $ARGV[0] eq "-c" ) { shift; my $cfname = shift; open FP, "$cfname" or die "Cannot open cluster file $cfname\n"; while( ) { my @arr = split; $clusters{$arr[0]} = $arr[1]; } close FP; $useclusters = 1; } if( !$TRAIN ) { my $modelname = shift; open FP, "$modelname" or die "Cannot open model $modelname\n"; my $clstr = ; @actarr = split ' ', $clstr; shift @actarr; while() { my @arr = split; my $fname = shift @arr; my $i = 0; foreach my $fw (@arr) { if( $fw != 0 ) { ${$wdep{$fname}}{$i} = $fw; } $i++; } } close FP; } sub classify { my @arr = @_; my @res = (); push @arr, "**BIAS**"; foreach my $f (@arr) { if (exists $wdep{$f}) { for (my $i = 0; $i < @actarr; $i++) { if( exists $wdep{$f}{$i} ) { $res[$i] += $wdep{$f}{$i}; } } } } my @fres = (); for (my $i = 0; $i < @res; $i++) { $fres[$i] = { cname => $actarr[$i], weight => $res[$i], }; } @fres = sort {$b->{weight} <=> $a->{weight}} @fres; my $bestlabel = $fres[0]->{cname}; if ($bestlabel eq "") { $bestlabel = "NONE"; } return $bestlabel; } sub printconll { my @s = @_; for( my $i = 1; $i < @s; $i++ ) { print "$s[$i]->{idx}\t"; print "$s[$i]->{word}\t"; print "$s[$i]->{lemma}\t"; print "$s[$i]->{cpos}\t"; print "$s[$i]->{pos}\t"; print "$s[$i]->{morph}\t"; print "$s[$i]->{link}\t"; print "$s[$i]->{label}\n"; } print "\n"; } sub conllReadSentence { my $str = ; my @sent = (); push @sent, { idx => 0, word => "LeftWall", lemma => "leftwall", cpos => "lw", pos => "LW", morph => "_", glink => 0, glabel => "_", link => 0, label => "_", }; # skip blank lines while( $str =~ /^$/ ) { if( !( $str = ) ) { return (); } } do { my @arr = split ' ', $str; if( @arr < 3 ) { return @sent; } push @sent, { idx => $arr[0], word => $arr[1], lemma => $arr[2], cpos => $arr[3], pos => $arr[4], morph => $arr[5], glink => $arr[6], glabel => $arr[7], link => 0, label => "_", }; } while( $str = ); if( @sent > 1 ) { return @sent; } return (); } sub mkfeats { my $i = shift; my $j = shift; my @s = @_; my $iprev2 = { idx => 0, word => "LeftWall", lemma => "leftwall", cpos => "lw", pos => "LW", morph => "_", glink => 0, glabel => "_", link => 0, label => "_", }; my $iprev1 = { idx => 0, word => "LeftWall", lemma => "leftwall", cpos => "lw", pos => "LW", morph => "_", glink => 0, glabel => "_", link => 0, label => "_", }; my $inext1 = { idx => 0, word => "RightWall", lemma => "rightwall", cpos => "rw", pos => "RW", morph => "_", glink => 0, glabel => "_", link => 0, label => "_", }; my $inext2 = { idx => 0, word => "RightWall", lemma => "rightwall", cpos => "rw", pos => "RW", morph => "_", glink => 0, glabel => "_", link => 0, label => "_", }; my $iw = $s[$i]; my $jw = $s[$j]; if( $i > 1 ) { $iprev1 = $s[$i-1]; } if( $i > 2 ) { $iprev1 = $s[$i-2]; } if( $i < @s - 1 ) { $inext1 = $s[$i+1]; } if( $i < @s - 2 ) { $inext2 = $s[$i+2]; } my $jprev2 = { idx => 0, word => "LeftWall", lemma => "leftwall", cpos => "lw", pos => "LW", morph => "_", glink => 0, glabel => "_", link => 0, label => "_", }; my $jprev1 = { idx => 0, word => "LeftWall", lemma => "leftwall", cpos => "lw", pos => "LW", morph => "_", glink => 0, glabel => "_", link => 0, label => "_", }; my $jnext1 = { idx => 0, word => "RightWall", lemma => "rightwall", cpos => "rw", pos => "RW", morph => "_", glink => 0, glabel => "_", link => 0, label => "_", }; my $jnext2 = { idx => 0, word => "RightWall", lemma => "rightwall", cpos => "rw", pos => "RW", morph => "_", glink => 0, glabel => "_", link => 0, label => "_", }; if( $j > 1 ) { $jprev1 = $s[$j-1]; } if( $j > 2 ) { $jprev1 = $s[$j-2]; } if( $j < @s - 1 ) { $jnext1 = $s[$j+1]; } if( $j < @s - 2 ) { $jnext2 = $s[$j+2]; } my $jh = $s[$jw->{glink}]; my @feats = (); my $dir = "L"; if( $i > $j ) { $dir = "R"; } my $dist = abs($i - $j); if( $dist > 7 ) { $dist = 7; } elsif( $dist > 5 ) { $dist = 5; } push @feats, "$dir"; push @feats, "iw:$iw->{word}"; push @feats, "iwpos:$iw->{pos}"; push @feats, "iwcpos:$iw->{cpos}"; push @feats, "iwmorph:$iw->{morph}"; push @feats, "jw:$jw->{word}"; push @feats, "jwpos:$jw->{pos}"; push @feats, "jwcpos:$jw->{cpos}"; push @feats, "jwcpos:$jw->{morph}"; push @feats, "iwmjwm:$iw->{morph}:$jw->{morph}"; my @marr = split '\|', $iw->{morph}; foreach my $m ( @marr ) { push @feats, "im:$m"; } push @feats, "ip1:$iprev1->{word}"; push @feats, "ip1pos:$iprev1->{pos}"; push @feats, "ip2:$iprev2->{word}"; push @feats, "ip2pos:$iprev2->{pos}"; push @feats, "in1:$inext1->{word}"; push @feats, "in1pos:$inext1->{pos}"; push @feats, "in2:$inext2->{word}"; push @feats, "in2pos:$inext2->{pos}"; my $n = @feats; for( my $a = 0; $a < $n-1; $a++ ) { push @feats, "$feats[$a]~ipos:$iw->{pos}"; push @feats, "$feats[$a]~jpos:$jw->{pos}"; push @feats, "$feats[$a]~ipos:$iw->{cpos}"; push @feats, "$feats[$a]~jpos:$jw->{cpos}"; push @feats, "$feats[$a]~dir:$dir"; if( $useclusters ) { push @feats, "c:$feats[$a]~ic:$clusters{$iw->{word}}"; } } if( $useclusters ) { push @feats, "$clusters{$iw->{word}}:$clusters{$jw->{word}}"; } return @feats; } my $numtr = 0; sub parse { my $train = shift; my @s = @_; my @res = @s; if( 1 ) { for( my $i = 1; $i < @s; $i++ ) { $s[$i]->{link} = $s[$i]->{glink}; $s[$i]->{lchpos} = ":"; $s[$i]->{rchpos} = ":"; $s[$i]->{chpos} = ":"; $s[$i]->{numout} = 0; } for( my $i = 1; $i < @s; $i++ ) { $s[$s[$i]->{glink}]->{numout}++; $s[$s[$i]->{glink}]->{chpos} .= "$s[$i]->{pos}:"; if( $i > $s[$i]->{glink} ) { $s[$s[$i]->{glink}]->{lchpos} .= "$s[$i]->{pos}:"; } else { $s[$s[$i]->{glink}]->{rchpos} .= "$s[$i]->{pos}:"; } } } for( my $i = 1; $i < @s; $i++ ) { my $j = $s[$i]->{glink}; my @farr = &mkfeats( $i, $j, @s ); if( $train ) { $numtr++; if( $numtr == $NUMTR ) { print "DEV\n"; } my $fstr = join " ", @farr; print "$s[$i]->{glabel} $fstr\n"; } else { my $ans = &classify( @farr ); $s[$i]->{label} = $ans; } } if( !$train ) { &printconll( @res ); } } sub mainLoop { while( my @s = &conllReadSentence ) { &parse( $TRAIN, @s ); } } &mainLoop();