目录
fluid:
https://github.com/PaddlePaddle/models/tree/develop/fluid/sequence_tagging_for_ner
v2:
https://github.com/PaddlePaddle/models/tree/develop/legacy/sequence_tagging_for_ner
掉渣天的徐老师已经在开源版本的paddlepaddle实现了rnn+crf,我们直接拿来学习学习就好啦!!!【此demo只在github版本有,且paddle的其他源都已经不维护了,诸君请自重~】
先看一下数据集的说明: http://www.clips.uantwerpen.be/conll2000/chunking/
#[NP He ] [VP reckons ] [NP the current account deficit ] [VP will narrow ] [PP to ] [NP only # 1.8 billion ] [PP in ] [NP September ] .
#
#He PRP B-NP
#reckons VBZ B-VP
#the DT B-NP
#current JJ I-NP
#account NN I-NP
#deficit NN I-NP
#will MD B-VP
#narrow VB I-VP
#to TO B-PP
#only RB B-NP
## # I-NP
#1.8 CD I-NP
#billion CD I-NP
#in IN B-PP
#September NNP B-NP
#. . O
#
#The O chunk tag is used for tokens which are not part of any chunk
#
#
首先,需要将文件组织成三列,第一列是单词(中文的单字),第二列是tag_pos(part-of-speech tag as derived by the Brill tagger),第三列是标签tag(B-x表示开始,I-x表示中间)。比如,我们写好了test.txt,那么我们运行gzip test.txt,这样,就生成了test.txt.gz(解压的时候gzip -dc test.txt.gz > test.txt,就会生成test.txt[如果不用-c参数,会默认删掉test.txt.gz])。
文件 | 行数 | 序列个数 |
---|---|---|
train.txt | 220663 | 8936 |
test.txt | 49389 | 2012 |
然后我们来仔细看一下dataprovider:
# Feature combination patterns (the context features fed to the CRF).
# Each pattern is a list of [offset, column] pairs: `offset` is relative to
# the current token, `column` indexes the token's fields (0 = word, 1 = POS).
# [[-1,0], [0,0]] means previous token at column 0 and current token at
# column 0 are combined as one feature.
patterns = [
    [[-2,0]],
    [[-1,0]],
    [[0,0]],
    [[1,0]],
    [[2,0]],
    [[-1,0], [0,0]],
    [[0,0], [1,0]],
    [[-2,1]],
    [[-1,1]],
    [[0,1]],
    [[1,1]],
    [[2,1]],
    [[-2,1], [-1,1]],
    [[-1,1], [0,1]],
    [[0,1], [1,1]],
    [[1,1], [2,1]],
    [[-2,1], [-1,1], [0,1]],
    [[-1,1], [0,1], [1,1]],
    [[0,1], [1,1], [2,1]],
]
# CoNLL-2000 chunk label -> integer id.
# 'B-x' begins a chunk of type x, 'I-x' continues it, and 'O' marks tokens
# outside any chunk (11 chunk types * 2 + 'O' = 23 labels).
dict_label = {
    'B-ADJP': 0,
    'I-ADJP': 1,
    'B-ADVP': 2,
    'I-ADVP': 3,
    'B-CONJP': 4,
    'I-CONJP': 5,
    'B-INTJ': 6,
    'I-INTJ': 7,
    'B-LST': 8,
    'I-LST': 9,
    'B-NP': 10,
    'I-NP': 11,
    'B-PP': 12,
    'I-PP': 13,
    'B-PRT': 14,
    'I-PRT': 15,
    'B-SBAR': 16,
    'I-SBAR': 17,
    'B-UCP': 18,
    'I-UCP': 19,
    'B-VP': 20,
    'I-VP': 21,
    'O': 22
}
def make_features(sequence):
    """Augment every token of `sequence` in place with combined context features.

    `sequence` is a list of token rows (each a list of column values, e.g.
    [word, pos_tag, label]). For every position and every entry of the
    module-level `patterns` list, the referenced (offset, column) values are
    joined with '/' and appended to that token's row. Offsets falling before
    the start or past the end of the sequence produce sentinel tokens
    '#B<k>' / '#E<k>', where k is how far outside the sequence they fall.
    """
    n_tokens = len(sequence)
    n_cols = len(sequence[0])

    def context(pos):
        # Before the first token: e.g. pos == -1 yields '#B1'.
        if pos < 0:
            return ['#B%s' % -pos] * n_cols
        # Past the last token: e.g. pos == n_tokens yields '#E1'.
        if pos >= n_tokens:
            return ['#E%s' % (pos - n_tokens + 1)] * n_cols
        return sequence[pos]

    for idx in range(n_tokens):
        row = sequence[idx]
        for pattern in patterns:
            # e.g. pattern [[0,1], [1,1], [2,1]] -> a combined feature such
            # as "NNP/NNP/POS"; a single-pair pattern yields one raw value.
            combined = '/'.join([context(idx + off)[col] for off, col in pattern])
            row.append(combined)
# Used by create_dictionaries():
# cutoff: a list of numbers. If count of a feature is smaller than this,
#         it will be ignored.
# One entry per column: 3 original columns, then one per pattern.
cutoff = [3, 1, 0]
cutoff += [3] * len(patterns)
# Expanded value for illustration (3 + 19 patterns = 22 entries):
cutoff=[3, 1, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
# Per-column out-of-vocabulary handling for create_dictionaries()/gen_sample().
# If oov_policy[i] is OOV_POLICY_USE, id 0 is reserved for OOV features of
# i-th column.
OOV_POLICY_IGNORE = 0   # drop/placeholder unknown features
OOV_POLICY_USE = 1      # map unknown features to the reserved OOV id 0
OOV_POLICY_ERROR = 2    # unknown features are fatal errors
oov_policy = [OOV_POLICY_IGNORE, OOV_POLICY_ERROR, OOV_POLICY_ERROR]
oov_policy += [OOV_POLICY_IGNORE] * len(patterns)
# Expanded value for illustration (word column ignores OOV, POS/label error):
oov_policy=[0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
#return a list of dict for each column
def create_dictionaries(filename, cutoff, oov_policy):
    """Build one feature->id dictionary per column from a gzipped corpus.

    filename: gzip file of space-separated token rows; a blank line ends a
        sequence (sentence). Each sequence is first expanded by
        make_features(), so the columns are the original ones plus one
        column per pattern.
    cutoff: per-column minimum count; features seen fewer times are dropped.
    oov_policy: per-column OOV policy (see OOV_POLICY_* constants).
    Returns: a list of dicts mapping feature string -> integer id.
    """

    def add_to_dict(sequence, dicts):
        """Count every feature of `sequence` into the per-column dicts."""
        num_features = len(dicts)
        for features in sequence:
            l = len(features)
            # NOTE(review): `line` is the caller's loop variable captured by
            # closure; it is the raw input line being processed.
            assert l == num_features, "Wrong number of features " + line
            for i in xrange(l):
                if features[i] in dicts[i]:
                    dicts[i][features[i]] += 1
                else:
                    dicts[i][features[i]] = 1

    # One counting dict per column (len(cutoff) == number of columns).
    num_features = len(cutoff)
    dicts = []
    for i in xrange(num_features):
        dicts.append(dict())
    f = gzip.open(filename, 'rb')
    sequence = []
    for line in f:
        line = line.strip()
        if not line:  # blank line: end of one sentence/sequence
            # NOTE(review): a final sequence not followed by a blank line
            # would be silently dropped — confirm the data files end with one.
            make_features(sequence)
            add_to_dict(sequence, dicts)
            sequence = []
            continue
        features = line.split(' ')
        sequence.append(features)
    # Second pass over the counts: drop rare features and renumber the rest
    # into dense ids (starting at 1 when id 0 is reserved for OOV).
    for i in xrange(num_features):
        dct = dicts[i]
        n = 1 if oov_policy[i] == OOV_POLICY_USE else 0
        todo = []
        for k, v in dct.iteritems():
            if v < cutoff[i]:
                todo.append(k)
            else:
                dct[k] = n
                n += 1
        if oov_policy[i] == OOV_POLICY_USE:
            # placeholder so that len(dct) will be the number of features
            # including OOV
            dct['#OOV#'] = 0
        logger.info('column %d dict size=%d, ignored %d' % (i, n, len(todo)))
        for k in todo:
            del dct[k]
    f.close()
    return dicts
def gen_sample(sequence):
    """Convert one featurized sequence into a training sample.

    `sequence` is a list of token rows already expanded by make_features()
    (original columns first, then one value per pattern). Relies on the
    module globals `dicts`, `oov_policy`, `num_original_columns`, `patterns`
    and `logger`.

    Returns a list of slots: one list of ids per original column, plus —
    when `patterns` is non-empty — a final slot holding, per token, a sparse
    vector of pattern-feature ids offset by the cumulative dictionary sizes.
    """
    num_features = len(dicts)
    # One slot per original column (num_original_columns == 3 here).
    sample = [list() for i in xrange(num_original_columns)]
    if patterns:
        sample.append([])
    for features in sequence:
        # BUG FIX: the message used `line`, which is not defined in this
        # scope and would itself raise NameError when the assert fired.
        assert len(features) == num_features, \
            "Wrong number of features: " + str(features)
        # Original columns: look each value up in its dictionary.
        for i in xrange(num_original_columns):
            feat_id = dicts[i].get(features[i], -1)
            if feat_id != -1:
                sample[i].append(feat_id)
            elif oov_policy[i] == OOV_POLICY_IGNORE:
                sample[i].append(0xffffffff)  # sentinel for ignored OOV
            elif oov_policy[i] == OOV_POLICY_ERROR:
                logger.fatal("Unknown token: %s" % features[i])
            else:
                sample[i].append(0)  # OOV_POLICY_USE: reserved OOV id
        if patterns:
            # Pattern (CRF context) columns: pack all columns of this token
            # into one sparse vector, offsetting each column's ids by the
            # sizes of the preceding dictionaries.
            dim = 0
            vec = []
            for i in xrange(num_original_columns, num_features):
                feat_id = dicts[i].get(features[i], -1)
                if feat_id != -1:
                    vec.append(dim + feat_id)
                elif oov_policy[i] == OOV_POLICY_IGNORE:
                    pass
                elif oov_policy[i] == OOV_POLICY_ERROR:
                    logger.fatal("Unknown token: %s" % features[i])
                else:
                    # BUG FIX: original called `vec.ids.append(dim + 0)`, but
                    # `vec` is a plain list — AttributeError whenever a
                    # pattern column used OOV_POLICY_USE.
                    vec.append(dim + 0)
                dim += len(dicts[i])
            sample[-1].append(vec)
    return sample
实现的其实是sgd_crf,我们可以看到,模型结构图如下所示【不要急 慢慢看】:
首先,从train.txt中,我们可以发现:
另外,num_label_types = int(math.ceil(float(slot2_size) / 8)) * 8 = 24(slot2_size=23个标签,向上取整到8的倍数)
4个data_layer:
接下来是一个fc_layer(input=features,size=num_label_types=24,name=crf_input)
然后这个crf_input后面接了一个crf_layer(input=crf_input,label=chunk)
实现的其实是大标题3的双向lstm和大标题2的crf,我们可以看到,模型结构图如下所示【不要急 慢慢看】: