#! /usr/bin/env python

# Copyright (c) 2014, U Chun Lao
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# train log regression model
#
# Reads labelled training lines, turns each one into a feature dict, fits a
# scikit-learn LogisticRegression on the vectorized dicts, and writes the
# model, the DictVectorizer and the feature encoding table to disk.
import sys
import string
import os
import pickle
import numpy
import argparse


def encode(feat, encoding):
    """Map each entry of *feat* to its integer code.

    Entries that are not yet in *encoding* are assigned the next free code
    (the current size of the dictionary); *encoding* is updated in place.

    Returns a pair ``(codes, newEnc)``: ``codes`` is the list of integer
    codes in the order of *feat*, and ``newEnc`` is *encoding* itself when
    at least one new entry was added, ``False`` otherwise.
    """
    rtn = []
    addNew = False
    for f in feat:
        if f not in encoding:
            encoding[f] = len(encoding)
            addNew = True
        rtn.append(encoding[f])
    return rtn, (encoding if addNew else False)


def encode2(inFeat, encoding):
    """Separate the features of every item in *inFeat* and encode them.

    Each item of *inFeat* is a string that is split on whitespace.
    Components 0-3 are stored under the keys ``word-i``, ``lemma-i``,
    ``cpos-i`` and ``pos-i`` (``i`` is the item's index) as integer codes.
    Component 4 is split on ``'|'`` into morphological features when it
    contains ``'='``; otherwise the item has no morphological features.
    Cross features pairing components of two different items are added
    under ``cross-...`` keys, also as integer codes.

    Returns a pair ``(feat, newEnc)``: ``feat`` is the feature dict and
    ``newEnc`` is *encoding* (updated in place) when new entries were added
    to it, ``False`` otherwise.

    Raises IndexError when an item has fewer than five components.
    """
    feat = {}
    extFeat = []
    # indices of the components that take part in the cross features
    extList = [0, 3]
    crossFeat = {}
    addNew = False

    for i, rawFeat in enumerate(inFeat):
        sptFeat = rawFeat.split()
        # separate morphological features
        morFeat = sptFeat[4].split('|') if '=' in sptFeat[4] else []
        spt, newEnc = encode(sptFeat[:4] + morFeat, encoding)
        if newEnc:
            encoding = newEnc
            addNew = True
        # store extracted features for later use
        extFeat.append([sptFeat[j] for j in extList])
        feat['word-%d' % i] = spt[0]
        feat['lemma-%d' % i] = spt[1]
        feat['cpos-%d' % i] = spt[2]
        feat['pos-%d' % i] = spt[3]
        # Morphological features get a code in the encoding table above,
        # but the feature dict keeps their raw string, not the code.
        for j, mor in enumerate(morFeat):
            feat['mor-%d-%d' % (j, i)] = mor

    # add combined features: for every ordered pair of distinct items, pair
    # extracted component k1 of the first with components k1.. of the second
    for i in range(len(inFeat)):
        for j in range(len(inFeat)):
            if i == j:
                continue
            for k1 in range(len(extFeat[i])):
                for k2 in range(k1, len(extFeat[j])):
                    crossFeat['cross-%d,%d|%d,%d' % (i, k1, j, k2)] = \
                        '%s|%s' % (extFeat[i][k1], extFeat[j][k2])

    # Snapshot the keys as a list: a dict view cannot be indexed on
    # Python 3, and the keys must line up with the codes returned below.
    crossKeys = list(crossFeat)
    spt, newEnc = encode([crossFeat[c] for c in crossKeys], encoding)
    if newEnc:
        encoding = newEnc
        addNew = True
    for key, code in zip(crossKeys, spt):
        feat[key] = code

    return feat, (encoding if addNew else False)


def _readExamples(lines, encoding):
    """Parse training lines into ``(X, y)``; *encoding* is updated in place.

    Every line is stripped of newlines and split on single spaces.  Lines
    with fewer than three fields are skipped.  Field 0 is the label (parsed
    as a float); fields 1 and 2 are the two feature descriptions handed to
    encode2.  Because the line is split on spaces while encode2 splits each
    description on any whitespace, the components inside a description have
    to be separated by whitespace other than a space (e.g. tabs).
    """
    X = []
    y = []
    for l in lines:
        spt = l.replace('\n', '').split(' ')
        if len(spt) < 3:
            continue
        y.append(float(spt[0]))
        encFeat, newEnc = encode2((spt[1], spt[2]), encoding)
        if newEnc:
            encoding = newEnc
        X.append(encFeat)
    return X, y


def _parseArgs(argv=None):
    """Parse the command line (``sys.argv[1:]`` when *argv* is None)."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-inputF', '-input', required=True,
                        help='input lattice file')
    parser.add_argument('-logReg', required=True,
                        help='log regression model file')
    parser.add_argument('-enco', '-e', required=True,
                        help='log regression model encoding')
    parser.add_argument('-vect', '-v', required=True,
                        help='log regression model vectorizer')
    parser.add_argument('-logErr', '-logtostderr', default=False,
                        action='store_true', help='log to stderr')
    return parser.parse_args(argv)


def main(argv=None):
    """Train the model and write the model, vectorizer and encoding files."""
    # Imported here so that encode/encode2 stay importable on machines
    # without scikit-learn installed.
    from sklearn import linear_model, feature_extraction

    args = _parseArgs(argv)
    # model file
    logReg = args.logReg
    # encoding file
    enco = args.enco
    # vectorizer file
    vect = args.vect
    # input file
    inputF = args.inputF
    # log to stderr
    logErr = args.logErr

    def log(msg):
        # progress messages are only emitted when -logErr is given
        if logErr:
            sys.stderr.write(msg)

    encoding = {}
    model = linear_model.LogisticRegression()

    # read input
    if inputF:
        # 'with' closes the input file once it has been consumed
        with open(inputF, 'r') as inF:
            X, y = _readExamples(inF, encoding)
    else:
        X, y = _readExamples(sys.stdin, encoding)
    log('Read in %d entries, with %d distinct features\n'
        % (len(X), len(encoding)))

    # transform training data list into data vector
    dv = feature_extraction.DictVectorizer()
    vecX = dv.fit_transform(X)
    log('Successfully vectorize %d training examples\n' % len(X))

    # fit model to data; the score is computed on the training data itself
    model.fit(vecX, y)
    score = model.score(vecX, y)
    log('Successfully fit log regression model to data with score %f\n'
        % (score))

    if logReg:
        with open(logReg, 'wb') as modelF:
            pickle.dump(model, modelF)
        log('Write logistic regression model to %s\n' % logReg)

    if vect:
        with open(vect, 'wb') as vectF:
            pickle.dump(dv, vectF)
        log('Write vectorizer to %s\n' % vect)

    if enco:
        # one "<feature> <code>" pair per line
        with open(enco, 'w') as f:
            for e in encoding:
                f.write('%s %d\n' % (e, encoding[e]))
        log('Write encoding table to %s\n' % enco)


if __name__ == '__main__':
    main()