#!/usr/bin/env python # Copyright (c) 2014, U Chun Lao All rights reserved. # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # construct the best lattice path from the nodisamb file given the gold std as oracle import sys, heapq, argparse parser=argparse.ArgumentParser() parser.add_argument('-gold', '-g', required=True, help='gold standard file') parser.add_argument('-latt', '-l', required=True, help='raw lattice file') parser.add_argument('-out', '-o', help='output file') args = parser.parse_args(sys.argv[1:]) # gold standard file goldF = args.gold # raw lattice file lattF = args.latt # output file outF = args.out # read gold stnadard gold = [] buff = [] for l in open(goldF, 'r'): if len(l.strip()) > 0: buff.append(l.replace('\n', '').split()[1:6]) else: gold.append(buff) buff = [] print 'read in %d gold standards' % len(gold) # read lattices latt = [] for l in open(lattF, 'r'): if len(l.strip()) > 0: spt = l.replace('\n', '').split() head = int(spt[0]) tail = int(spt[1]) # skip if it is a cycle if head == tail: continue buff.append((head, tail, spt[2:7])) else: latt.append(buff) buff = [] print 'read in %d lattices' % len(latt) # return the number of fields that match between gold and candidate, # 0 if word does not match def matchTkn(gold, cand): score = -2 if gold[0] == cand[0]: score = 0 for f in range(len(gold)-1): if gold[f+1] == cand[f+1]: score += 1 return score # select best lattice path given the gold standard def select(gold, latt): # create gold standard indexing orig = {} glen = 0 for g in gold: orig[glen] = g glen += len(g[0]) # create lattice look up table nodes = {} endNode = 0 for l in latt: if l[1] > endNode: endNode = l[1] if l[0] in nodes: nodes[l[0]].append((l[1], l[2])) else: nodes[l[0]] = [(l[1], l[2])] # search for best path ends = {} # stores paths into heap with their ending vertex # (number of edges that match the gold, path, current length) ends[0] = [(0, ['-ROOT-\t_\t_\t_\t_'], 0)] current = 0 broken = [] for current in range(endNode): # in case we get a broken lattice if current not in ends: continue # take the best incoming path that ends at current best = heapq.heappop(ends.pop(current)) broken = [] newBest = 0 # find the path that best matches the gold standard while newBest == 0 or len(broken) > 0: bestPath = [] bestSc = -1 if best[2] in orig: goldNode = orig[best[2]] for n in range(len(nodes[current])): if nodes[current][n][0] in broken: continue score = matchTkn(goldNode, nodes[current][n][1]) if score > bestSc: bestSc = score bestPath = [n] elif score == bestSc: bestPath.append(n) # expand the best path if len(bestPath) > 0: for i in range(len(bestPath)): bestNode = nodes[current][bestPath[i]] nextEnd = bestNode[0] if nextEnd != endNode and nextEnd not in nodes: broken.append(nextEnd) continue if nextEnd in ends: heapq.heappush(ends[nextEnd], (best[0] + 1, \ best[1] + ['\t'.join(bestNode[1])], \ best[2] + len(bestNode[1][0]))) else: ends[nextEnd]=[(best[0] + 1, \ best[1] + ['\t'.join(bestNode[1])], \ best[2] + len(bestNode[1][0]))] newBest += 1 # if none of the paths matches, continue with all else: for n in nodes[current]: if n[0] != endNode and n[0] not in nodes: broken.append(n[0]) continue newBest += 1 if n[0] in ends: heapq.heappush(ends[n[0]], (best[0], \ best[1] + ['\t'.join(n[1])], \ best[2] + len(n[1][0]))) else: ends[n[0]] = [(best[0], \ best[1] + ['\t'.join(n[1])], \ best[2] + len(n[1][0]))] if newBest > 0: broken = [] # return the best path return heapq.heappop(ends[endNode]) # end of select # select and write best path from lattices to file with open(outF, 'w') as f: for i in range(len(gold)): sys.stdout.write('\rprogress: %d / %d' % (i, len(gold))) best = select(gold[i], latt[i]) for j in range(len(best[1])-1): f.write('%d\t%s\t_\t_\t_\t_\n' % (j+1, best[1][j+1])) f.write('\n') sys.stdout.write('\rprogress: %d / %d\n' % (i+1, len(gold)))