#!/usr/bin/env python

# Copyright (c) 2014, U Chun Lao All rights reserved.

# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:

# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.

# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.

# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


# construct the best lattice path from the nodisamb file given the gold std as oracle

import sys, heapq, argparse

parser=argparse.ArgumentParser()
parser.add_argument('-gold', '-g', required=True, help='gold standard file')
parser.add_argument('-latt', '-l', required=True, help='raw lattice file')
parser.add_argument('-out', '-o', help='output file')

args = parser.parse_args(sys.argv[1:])

# gold standard file
goldF = args.gold

# raw lattice file
lattF = args.latt

# output file
outF = args.out


# read gold stnadard

gold = []
buff = []

for l in open(goldF, 'r'):
	if len(l.strip()) > 0:
		buff.append(l.replace('\n', '').split()[1:6])
	else:
		gold.append(buff)
		buff = []

print 'read in %d gold standards' % len(gold)

# read lattices

latt = []

for l in open(lattF, 'r'):
	if len(l.strip()) > 0:
		spt = l.replace('\n', '').split()
		head = int(spt[0])
		tail = int(spt[1])
		# skip if it is a cycle
		if head == tail:
			continue
		buff.append((head, tail, spt[2:7]))
	else:
		latt.append(buff)
		buff = []

print 'read in %d lattices' % len(latt)


# return the number of fields that match between gold and candidate,
# 0 if word does not match
def matchTkn(gold, cand):
	score = -2
	if gold[0] == cand[0]:
		score = 0
		for f in range(len(gold)-1):
			if gold[f+1] == cand[f+1]:
				score += 1
	return score


# select best lattice path given the gold standard
def select(gold, latt):
	# create gold standard indexing
	orig = {}
	
	glen = 0
	for g in gold:
		orig[glen] = g
		glen += len(g[0])

	# create lattice look up table
	nodes = {}
	endNode = 0

	for l in latt:
		if l[1] > endNode:
			endNode = l[1]
		if l[0] in nodes:
			nodes[l[0]].append((l[1], l[2]))
		else:
			nodes[l[0]] = [(l[1], l[2])]


	# search for best path
	ends = {}

	# stores paths into heap with their ending vertex
	# (number of edges that match the gold, path, current length)
	ends[0] = [(0, ['-ROOT-\t_\t_\t_\t_'], 0)]

	
	current = 0
	broken = []

	for current in range(endNode):
		# in case we get a broken lattice
		if current not in ends:
			continue

		# take the best incoming path that ends at current
		best = heapq.heappop(ends.pop(current))

		broken = []
		newBest = 0

		# find the path that best matches the gold standard
		while newBest == 0 or len(broken) > 0:
			bestPath = []
			bestSc = -1

			if best[2] in orig:
				goldNode = orig[best[2]]
				for n in range(len(nodes[current])):
					if nodes[current][n][0] in broken:
						continue
					score = matchTkn(goldNode, nodes[current][n][1])
					if score > bestSc:
						bestSc = score
						bestPath = [n]
					elif score == bestSc:
						bestPath.append(n)

			
			# expand the best path
			if len(bestPath) > 0:
				for i in range(len(bestPath)):
					bestNode = nodes[current][bestPath[i]]
					nextEnd = bestNode[0]

					if nextEnd != endNode and nextEnd not in nodes:
						broken.append(nextEnd)
						continue

					if nextEnd in ends:
						heapq.heappush(ends[nextEnd], (best[0] + 1, \
							best[1] + ['\t'.join(bestNode[1])], \
							best[2] + len(bestNode[1][0])))
					else:
						ends[nextEnd]=[(best[0] + 1, \
							best[1] + ['\t'.join(bestNode[1])], \
							best[2] + len(bestNode[1][0]))]

					newBest += 1
			# if none of the paths matches, continue with all
			else:
				for n in nodes[current]:
					if n[0] != endNode and n[0] not in nodes:
						broken.append(n[0])
						continue
					newBest += 1
					if n[0] in ends:
						heapq.heappush(ends[n[0]], (best[0], \
							best[1] + ['\t'.join(n[1])], \
							best[2] + len(n[1][0])))
					else:
						ends[n[0]] = [(best[0], \
							best[1] + ['\t'.join(n[1])], \
							best[2] + len(n[1][0]))]

			if newBest > 0:
				broken = []

	# return the best path
	return heapq.heappop(ends[endNode])


# end of select


# select and write best path from lattices to file
with open(outF, 'w') as f:
	for i in range(len(gold)):
		sys.stdout.write('\rprogress: %d / %d' % (i, len(gold)))
		best = select(gold[i], latt[i])
		for j in range(len(best[1])-1):
			f.write('%d\t%s\t_\t_\t_\t_\n' % (j+1, best[1][j+1]))
		f.write('\n')

sys.stdout.write('\rprogress: %d / %d\n' % (i+1, len(gold)))