import math

################################### Data Configuration ###################################
# Input feature and label dimensions.
word_dim = 4870   # vocabulary size of the word feature
pos_dim = 110     # number of distinct POS tags
label_dim = 4     # number of output labels for the CRF

TrainData(ProtoData(files = "train.list", type = "proto_sequence"))
TestData(ProtoData(files = "test.list", type = "proto_sequence"))

################################### Algorithm Configuration ###################################
Settings(
    algorithm = 'sgd',
    learning_rate_decay_a = 0,
    learning_rate_decay_b = 0,
    batch_size = 12,
    learning_rate = 0.01,
    learning_method = 'adagrad',
    #ada_epsilon = 1.0,
    #num_batches_per_send_parameter = 2,
    #num_batches_per_get_parameter = 1,
)

################################### Network Configuration ###################################
Inputs("word", "pos", "label", "place_holder")

embedding_size = 256      # word embedding dimension
hidden_dim = 256          # LSTM hidden-state dimension
sentence_vec_dim = 256    # dimension of the combined embedding/hidden layer
output_dim = 256          # dimension of the intermediate output layers
pos_embedding_size = 20   # POS tag embedding dimension
num_rnn_layers = 1        # number of stacked forward/backward LSTM pairs
lr_hid_col = 1            # learning-rate multiplier for the recurrent layers
lr_output = 1             # learning-rate multiplier for the output layers
lr_keep = 0.1             # reduced multiplier for the word embedding and first LSTM bias

Layer(name = "word", type = "data", size = word_dim)
Layer(name = "pos", type = "data", size = pos_dim)
Layer(name = "label", type = "data", size = label_dim)
Layer(name = "place_holder", type = "data", size = 1)

# Prefix used to name the layers and parameters built below.
name = "word"
parameter_name = ["word"]
offset = 0

# Word embedding lookup table.
MixedLayer(
    name = name + "_word_embedding",
    size = embedding_size,
    bias = False,
    inputs = TableProjection(
        name,
        initial_std = 1 / math.sqrt(embedding_size),
        learning_rate = lr_keep,
        parameter_name = "embedding",
        #sparse_remote_update = True
    )
)

# Stacked bidirectional LSTMs: each level runs a forward LSTM, then a
# backward (reversed) LSTM that sees both the forward output and the level's
# input via a skip connection. Level j > 0 takes the backward LSTM output of
# level j - 1 as its input.
for j in range(num_rnn_layers):
    input_name = (name + "_reverse_rnn%d" % (j - 1)) if j > 0 else name + "_word_embedding"

    MixedLayer(
        name = name + "_rnn%d_input" % j,
        size = hidden_dim,
        bias = False,
        inputs = FullMatrixProjection(
            input_name,
            initial_std = 1 / math.sqrt(hidden_dim),
            parameter_name = "%s_rnn%d_input_to_hidden" % (parameter_name[offset], j),
            learning_rate = lr_hid_col
        )
    )
    Layer(
        name = name + "_rnn%d" % j,
        type = "lstmemory",
        active_type = "tanh",
        active_state_type = "tanh",
        active_gate_type = "sigmoid",
        bias = Bias(
            initial_std = 0,
            parameter_name = '%s_rnn%d_bias' % (parameter_name[offset], j),
            learning_rate = lr_hid_col if j > 0 else lr_keep
        ),
        inputs = Input(
            name + "_rnn%d_input" % j,
            initial_std = 1 / math.sqrt(hidden_dim),
            parameter_name = "%s_rnn%d_weight" % (parameter_name[offset], j),
            learning_rate = lr_hid_col
        )
    )
    MixedLayer(
        name = name + "_reverse_rnn%d_input" % j,
        size = hidden_dim,
        bias = False,
        inputs = [
            FullMatrixProjection(
                name + "_rnn%d" % j,
                initial_std = 1 / math.sqrt(hidden_dim),
                parameter_name = "%s_reverse_rnn%d_input1_to_hidden" % (parameter_name[offset], j),
                learning_rate = lr_hid_col
            ),
            FullMatrixProjection(
                input_name,
                initial_std = 1 / math.sqrt(hidden_dim),
                parameter_name = '%s_reverse_rnn%d_input2_to_hidden' % (parameter_name[offset], j),
                learning_rate = lr_hid_col
            )
        ]
    )
    Layer(
        name = name + "_reverse_rnn%d" % j,
        type = "lstmemory",
        active_type = "tanh",
        active_state_type = "tanh",
        active_gate_type = "sigmoid",
        reversed = True,
        bias = Bias(
            initial_std = 0,
            parameter_name = '%s_reverse_rnn%d_bias' % (parameter_name[offset], j),
            learning_rate = lr_hid_col
        ),
        inputs = Input(
            name + "_reverse_rnn%d_input" % j,
            initial_std = 1 / math.sqrt(hidden_dim),
            parameter_name = "%s_reverse_rnn%d_weight" % (parameter_name[offset], j),
            learning_rate = lr_hid_col
        )
    )

# Combine every forward/backward LSTM output and the raw word embedding into
# one representation. Projections are initialized near zero so training
# starts close to the plain embedding signal.
MixedLayer(
    name = name + "_embedding_and_hidden",
    size = sentence_vec_dim,
    active_type = "relu",
    bias = False,
    inputs = [FullMatrixProjection(
                  name + "_rnn%d" % j,
                  initial_std = 0.000001 / math.sqrt(output_dim),
                  parameter_name = "%s_rnn%d_hidden_to_pooling" % (parameter_name[offset], j),
                  learning_rate = lr_hid_col
              ) for j in range(num_rnn_layers)]
           + [FullMatrixProjection(
                  name + "_reverse_rnn%d" % j,
                  initial_std = 0.000001 / math.sqrt(output_dim),
                  parameter_name = "%s_reverse_rnn%d_hidden_to_pooling" % (parameter_name[offset], j),
                  learning_rate = lr_hid_col
              ) for j in range(num_rnn_layers)]
           + [FullMatrixProjection(
                  name + "_word_embedding",
                  initial_std = 0.000001 / math.sqrt(output_dim),
                  parameter_name = "%s_embedding_to_pooling" % (parameter_name[offset]),
                  learning_rate = lr_hid_col
              )]
)

# POS tag embedding lookup table.
MixedLayer(
    name = "pos_embedding",
    size = pos_embedding_size,
    bias = False,
    inputs = TableProjection(
        "pos",
        initial_std = 1 / math.sqrt(pos_embedding_size),
        learning_rate = lr_output,
        parameter_name = "pos_embedding",
        #sparse_remote_update = True
    )
)

# Merge the word-side representation with the POS embedding.
FCLayer(
    name = "combine_vec",
    size = output_dim,
    active_type = "tanh",
    bias = Bias(initial_std = 0, parameter_name = 'combine_vec_bias', learning_rate = lr_output),
    inputs = [
        Input(
            name + "_embedding_and_hidden",
            learning_rate = lr_output,
            initial_std = 1 / math.sqrt(sentence_vec_dim),
            parameter_name = 'combine_%s_weight' % name,
        ),
        Input(
            "pos_embedding",
            learning_rate = lr_output,
            initial_std = 1 / math.sqrt(pos_embedding_size),
            parameter_name = 'combine_pos_weight'
        )
    ]
)

FCLayer(
    name = "similarity",
    size = output_dim,
    active_type = "tanh",
    bias = Bias(initial_std = 0, parameter_name = 'similarity_bias', learning_rate = lr_output),
    inputs = [FullMatrixProjection(
        "combine_vec",
        learning_rate = lr_output,
        initial_std = 1 / math.sqrt(output_dim),
        parameter_name = 'similarity_weight'
    )]
)

# Linear projection to per-label emission scores; the CRF below supplies the
# transition structure, so no activation or bias is applied here.
MixedLayer(
    name = "output",
    size = label_dim,
    #active_type = "softmax",
    #active_type = "sigmoid",
    #bias = Bias(initial_std = 0, parameter_name = 'output_bias', learning_rate = lr_output),
    bias = False,
    inputs = [FullMatrixProjection(
        "similarity",
        #learning_rate = lr_output,
        initial_std = 1 / math.sqrt(output_dim),
        #parameter_name = 'output_weight'
    )]
)

# CRF negative log-likelihood cost, used for training.
CRFLayer(
    name = "crf_cost",
    size = label_dim,
    inputs = [
        Input("output", parameter_name = "crfw"),
        Input("label")
    ]
)

# Viterbi decoding over the same emission scores.
Layer(
    name = "crf_layer",
    size = label_dim,
    type = 'crf_decoding',
    #bias = Bias(initial_std = 0, parameter_name = 'crf_bias', learning_rate = lr_output),
    inputs = [
        Input("output"),
        #Input("label")
    ]
)

Evaluator(
    name = "error",
    type = "sum",
    inputs = "crf_layer",
)

#Evaluator(
#    name = "chunk_f1",
#    type = "chunk",
#    inputs = ["crf_layer", "label"],
#    #chunk_scheme = "plain",
#    chunk_scheme = "IOB",
#    #num_chunk_types = 3,
#    num_chunk_types = 2,
#)

Outputs("crf_cost")
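
# Example launch command for a config like this with the legacy (pre-Fluid)
# PaddlePaddle command-line trainer. This is a sketch only: the save_dir,
# pass count, and GPU flag below are illustrative assumptions, not values
# taken from this config.
#
#   paddle train \
#       --config=<this_file>.py \
#       --save_dir=./output \
#       --num_passes=10 \
#       --use_gpu=false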