#! /usr/bin/env python # Copyright (c) 2014, U Chun Lao All rights reserved. # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# extract the best lattice path for each sentence based on the score
import sys
import argparse

import numpy  # not used by this script; import retained from the original file


def _parse_args(argv=None):
    """Parse the command-line options.

    argv: list of argument strings, or None to let argparse read
          sys.argv[1:].
    Returns the argparse namespace with attributes latt, group, score and
    output (output is False when -output/-o is not given).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-latt', '-l', default=False, required=True,
                        help='lattice paths file')
    parser.add_argument('-group', '-g', default=False, required=True,
                        help='file that divides the lattice file into sentence groups')
    parser.add_argument('-score', '-s', default=False, required=True,
                        help='file that contains scores for each lattice path in LATT')
    parser.add_argument('-output', '-o', default=False,
                        help='file that the best lattice paths will be written to')
    return parser.parse_args(argv)


def _read_group_sizes(path):
    """Return the integer found in the third tab-separated column of each line."""
    with open(path, 'r') as handle:
        return [int(line.split('\t')[2]) for line in handle]


def _read_scores(path):
    """Return the scores listed in the score file, in file order.

    Only lines containing the literal marker '(AD3)' are used; the score is
    the last whitespace-separated token of such a line.
    """
    with open(path, 'r') as handle:
        return [float(line.split()[-1]) for line in handle if '(AD3)' in line]


def _finish_path(groups, pending, path_lines, score, group_sizes):
    """Add a completed path to the group being built.

    The path is stored as a (score, lines) tuple.  Once the pending group
    holds as many paths as group_sizes says the next group should have, it
    is appended to groups.  Returns the list to keep filling: a fresh list
    when the group was closed, otherwise the same pending list.
    """
    pending.append((score, path_lines))
    if len(pending) >= group_sizes[len(groups)]:
        groups.append(pending)
        return []
    return pending


def _read_lattices(path, scores, group_sizes):
    """Read the lattice paths and arrange them into groups.

    A path is a run of non-blank lines; a blank (whitespace-only) line ends
    the current path.  The i-th path in the file is paired with scores[i].
    Returns (groups, total) where groups is a list of lists of
    (score, lines) tuples and total is the number of paths read.
    Raises IndexError when there are more paths than scores, or more paths
    than the group sizes account for.
    """
    groups = []
    pending = []
    path_lines = []
    total = 0
    with open(path, 'r') as handle:
        for line in handle:
            if line.strip():
                path_lines.append(line.replace('\n', ''))
                continue
            pending = _finish_path(groups, pending, path_lines,
                                   scores[total], group_sizes)
            path_lines = []
            total += 1
    # A file that does not end with a blank line still holds one last path;
    # treat end-of-file as its terminator instead of silently dropping it.
    if path_lines:
        pending = _finish_path(groups, pending, path_lines,
                               scores[total], group_sizes)
        total += 1
    return groups, total


def _select_best(groups):
    """Pick the highest-scoring path of every group.

    Ties keep the earliest path.  Returns (best_paths, gold_count) where
    gold_count is the number of groups whose first path was selected (the
    script's output message refers to that first path as the gold standard).
    """
    best_paths = []
    gold_count = 0
    for group in groups:
        best_index = 0
        for index, candidate in enumerate(group):
            # strict comparison so an equal score never displaces an earlier path
            if candidate[0] > group[best_index][0]:
                best_index = index
        best_paths.append(group[best_index])
        if best_index == 0:
            gold_count += 1
    return best_paths, gold_count


def _write_paths(path, best_paths):
    """Write each selected path's lines, followed by one blank line."""
    with open(path, 'w') as handle:
        for _, lines in best_paths:
            for line in lines:
                handle.write('%s\n' % line)
            handle.write('\n')


def main(argv=None):
    """Run the extraction: read the inputs, report counts, write best paths.

    argv: optional list of command-line arguments (defaults to sys.argv[1:]).
    """
    args = _parse_args(argv)

    # read lattices, groupings and scores
    group_sizes = _read_group_sizes(args.group)
    print('read in %d groups' % len(group_sizes))

    scores = _read_scores(args.score)
    print('read in %d scores' % len(scores))

    groups, total = _read_lattices(args.latt, scores, group_sizes)
    print('read in %d sentences in %d groups' % (total, len(groups)))

    # process scores
    best_paths, gold_count = _select_best(groups)
    print('%d gold standards are rated as the best' % (gold_count))

    # the output file is optional; without it only the counts are reported
    if args.output:
        _write_paths(args.output, best_paths)


if __name__ == '__main__':
    main()