import numpy
import time
import sys
import random

from is13.data import load
from is13.rnn.nnet_for_dependency_trees import model
from is13.metrics.accuracy import conlleval
from is13.utils.tools import shuffle, minibatch, contextwin

from keras.preprocessing import sequence as seq

def load_conll_data(conll_format_data):
    """Parse a CoNLL-format file into (words, children, sentiment) tuples."""

    # TODO: use word ids into the embedding matrix instead of positions
    # within the sentence.

    label_trans = {'_\n': 0, 'A\n': 1, 'A': 1, 'T': 1, 'T\n': 1}  # currently unused

    sentences = []
    k = 0
    y = 0
    with open(conll_format_data) as fr:
        s = []
        for line in fr:

            if len(line) < 2:  # blank line marks the end of a sentence

                # for every token i, collect the 1-based positions of its children
                for i in range(len(s)):
                    children = []
                    for j in range(len(s)):
                        if s[j][1] == i + 1:
                            children.append(s[j][0])
                    s[i].append(children)

                words = [x[0] for x in s]
                # pad the variable-length children lists with -1 into a matrix
                children = seq.pad_sequences([x[3] for x in s], padding='post', value=-1)
                if len(s) > 2:
                    sentences.append((words, children, y))
                s = []
                k = 0
            else:
                toks = line.split(' ')
                word = toks[1].decode('utf8')  # currently unused, see the TODO above
                parent = int(toks[6])
                sentiment = int(toks[-4] == 'S')
                if parent == 0:  # this word is the root of the phrase
                    y = sentiment
                s.append([k + 1, parent, sentiment])
                k = k + 1
    return sentences
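
# Illustrative sketch (not part of the original file): for a toy 3-token
# sentence whose head column reads 0, 1, 1 and whose root token carries 'S'
# in column -4, load_conll_data would yield one tuple of the form:
#
#   words    = [1, 2, 3]                    # 1-based positions (see TODO)
#   children = [[2, 3], [-1, -1], [-1, -1]] # child positions, padded with -1
#   y        = 1                            # sentiment of the root (head == 0)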
                

if __name__ == '__main__':

    s = {'lr':0.0627142536696559,
         'verbose':1,
         'decay':False, # decay on the learning rate if improvement stops
         'nepochs':2,
         'seed':345,

         'de':10, # dimension of word embedding
         'nh':10, # dimension of hidden state
         'nc':2 , # number of output classes
         'ne':50, # vocabulary size
         'ds':10}  # dimension of sentiment state
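
    # Note (added, hedged): because load_conll_data currently feeds token
    # positions rather than vocabulary ids (see its TODO), 'ne' must be at
    # least the longest sentence length + 1, or embedding lookups will go
    # out of range.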

    conll_format_data = '/home/rexamine/Doktorat/Opinion_Targets/opta-tagger/train_data/conll-format/train.conll'
    data = load_conll_data(conll_format_data)[0:2]  # note: only the first two sentences are used

    nsentences = len(data)

    # instantiate the model
    numpy.random.seed(s['seed'])
    random.seed(s['seed'])
    rnn = model(    nh = s['nh'],
                    nc = s['nc'],
                    ne = s['ne'],
                    de = s['de'],
                    ds = s['ds'])

    # train  # TODO: add early stopping on a validation set
   
    s['clr'] = s['lr']
    for e in xrange(s['nepochs']):
        # shuffle
        shuffle([data], s['seed'])
        #s['ce'] = e
        tic = time.time()
        for i in xrange(nsentences):
            rnn.train(data[i][0],data[i][1], data[i][2], s['clr'])
            rnn.normalize()

            if s['verbose']:
                print('[learning] epoch %i >> %2.2f%% completed in %.2f (sec) <<\r' % (e, (i + 1) * 100. / nsentences, time.time() - tic))
                sys.stdout.flush()
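
        # Hedged sketch (not in the original file): s['decay'] is declared in
        # the config above but never applied. One plausible use, matching the
        # comment "decay on the learning rate if improvement stops", is to
        # halve the current learning rate; without a validation set to gauge
        # improvement, this sketch simply decays once per epoch when enabled.
        if s['decay']:
            s['clr'] *= 0.5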