实践参照博文:中文分词入门之字标注法4
步骤如下:
1. 下载 CRF++-0.58.tar.gz, 解压到 C:\path\crf++-0.58 下
2. 用 SWIG 工具得到 _CRFPP.pyd,CRFPP.py 两个文件。参见本博:Windows 7 x64系统安装CRFsuite给Python使用
3. 创建 模板文件 tmpl.txt 如下:
# Unigram
U00:%x[-2,0]
U01:%x[-1,0]
U02:%x[0,0]
U03:%x[1,0]
U04:%x[2,0]
U05:%x[-1,0]/%x[0,0]
U06:%x[0,0]/%x[1,0]
U10:%x[-2,1]
U11:%x[-1,1]
U12:%x[0,1]
U13:%x[1,1]
U14:%x[2,1]
U15:%x[-2,1]/%x[-1,1]
U16:%x[-1,1]/%x[0,1]
U17:%x[0,1]/%x[1,1]
U18:%x[1,1]/%x[2,1]
U20:%x[-2,1]/%x[-1,1]/%x[0,1]
U21:%x[-1,1]/%x[0,1]/%x[1,1]
U22:%x[0,1]/%x[1,1]/%x[2,1]
# Bigram
B
4. 编译 crf_learn.exe,crf_test.exe,或下载 CRF++-0.58.zip
5. 从 Bakeoff 2005 下载训练及测试语料 icwb2-data
6. 将需要用到的文件复制到同一目录
7. 用下列代码将训练语料转换成crf++需要的格式,python make_crf_train_data.py pku_train.utf8 pku_training_out.utf8
make_crf_train_data.py 代码:
# coding: utf-8
'''
make_crf_train_data.py
得到CRF++要求的格式的训练文件
用法:命令行--python make_crf_train_data.py input_file output_file
4 tags for character tagging: B(Begin), E(End), M(Middle), S(Single)
'''
import codecs
import sys
def character_tagging(input_file, output_file):
    """Convert a whitespace-segmented corpus into CRF++ training format.

    Every character of each word is written on its own line as
    "<char> <tag>" using the 4-tag scheme:
        S = single-character word, B = word begin,
        M = word middle,           E = word end.
    A blank line separates sentences (input lines), as CRF++ requires.

    Args:
        input_file:  path of the UTF-8 segmented corpus, one sentence
                     per line, words separated by whitespace.
        output_file: path of the UTF-8 CRF++ training file to create.
    """
    # Context managers guarantee both handles are closed even on error;
    # iterating the file object streams lines instead of materializing
    # the whole corpus with readlines().
    with codecs.open(input_file, 'r', 'utf-8') as input_data, \
         codecs.open(output_file, 'w', 'utf-8') as output_data:
        for line in input_data:
            word_list = line.strip().split()
            for word in word_list:
                if len(word) == 1:
                    output_data.write(word + " S\n")
                else:
                    output_data.write(word[0] + " B\n")
                    for w in word[1:-1]:
                        output_data.write(w + " M\n")
                    output_data.write(word[-1] + " E\n")
            # Blank line marks the sentence boundary for CRF++.
            output_data.write("\n")
if __name__ == '__main__':
    # With no arguments, fall back to the default PKU corpus file names.
    if len(sys.argv) == 1:
        src_path, dst_path = 'pku_training.utf8', 'pku_training_out.utf8'
    elif len(sys.argv) == 3:
        src_path, dst_path = sys.argv[1], sys.argv[2]
    else:
        print(len(sys.argv))
        print("pls use: python make_crf_train_data.py input output")
        sys.exit()
    character_tagging(src_path, dst_path)
8. 训练:crf_learn tmpl.txt pku_training_out.utf8 pku.model,得到大约21MB的model
9. 使用 python make_crf_test_data.py pku_test.utf8 pku_test_out.utf8
转换测试语料
10. 测试:crf_test -m pku.model pku_test_out.utf8 > pku_test_result.utf8
得到标注文件,还要用脚本进行转换,略繁琐,可以跳过直接进入下一步
11. 执行 python crf_segmenter.py pku.model pku_test.utf8 pku_test_word.utf8
得到分词输出结果,代码如下:
# coding:utf-8
'''
Author: 52nlpcn@gmail.com
Copyright 2014 @ YuZhen Technology
CRF Segmenter based character tagging:
4 tags for character tagging: B(Begin), E(End), M(Middle), S(Single)
'''
import codecs
import sys
import CRFPP
def crf_segmenter(input_file, output_file, tagger):
    """Segment UTF-8 text with a trained CRF++ Tagger.

    Feeds every non-space character of each input line to the tagger,
    reads back the predicted 4-tag sequence (B/M/E/S) and writes the
    words out separated by single spaces, one sentence per line.

    Args:
        input_file:  path of the UTF-8 plain-text file to segment.
        output_file: path of the UTF-8 segmented output file.
        tagger:      a CRFPP.Tagger loaded with a trained model.
    """
    source = codecs.open(input_file, 'r', 'utf-8')
    sink = codecs.open(output_file, 'w', 'utf-8')
    for line in source.readlines():
        tagger.clear()
        # One observation row per character: "o" is a dummy feature
        # column and "B" a placeholder answer tag.
        for ch in line.strip():
            ch = ch.strip()
            if ch:
                tagger.add((ch + "\to\tB").encode('utf-8'))
        tagger.parse()
        rows = tagger.size()
        cols = tagger.xsize()
        for row in range(rows):
            for col in range(cols):
                ch = tagger.x(row, col).decode('utf-8')
                label = tagger.y2(row)
                # A space follows the last character of a word (tags E
                # and S); word-initial/medial characters (B, M) are bare.
                sink.write(ch if label in ('B', 'M') else ch + ' ')
        sink.write('\n')
    source.close()
    sink.close()
if __name__ == '__main__':
    # usage: python crf_segmenter.py <model> <input> <output>
    if len(sys.argv) != 4:
        print("pls use: python crf_segmenter.py model input output")
        sys.exit()
    model_path, src_path, dst_path = sys.argv[1], sys.argv[2], sys.argv[3]
    segmenter = CRFPP.Tagger("-m " + model_path)
    crf_segmenter(src_path, dst_path, segmenter)
12. 用 crf_tag_score.py pku_test_gold.utf8 pku_test_word.utf8
对分词结果进行评测,代码如下:
import sys
import codecs
"""
通过与黄金标准文件对比分析中文分词效果.
使用方法:
python crf_tag_score.py test_gold.utf8 your_tagger_output.utf8
分析结果示例如下:
标准词数:104372 个,正确词数:96211 个,错误词数:6037 个
标准行数:1944,正确行数:589 ,错误行数:1355
Recall: 92.1808531024%
Precision: 94.0957280338%
F MEASURE: 93.1284483593%
参考:中文分词器分词效果的评测方法
http://ju.outofmemory.cn/entry/46140
"""
def read_line(f):
    """Read one line from file object *f* and normalize its whitespace.

    Strips the trailing newline / carriage return and surrounding
    spaces, then collapses every run of consecutive spaces into a
    single space so a later split(' ') yields no empty tokens.

    Returns:
        The cleaned line; '' at end of file.
    """
    line = f.readline()
    line = line.strip('\n').strip('\r').strip(' ')
    # BUG FIX: the original looped `while line.find(' ') >= 0` and did
    # `line.replace(' ', ' ')` — a no-op replacement, so the loop spun
    # forever on any line containing a space. Collapse double spaces
    # until none remain, which is the clearly intended behavior.
    while '  ' in line:
        line = line.replace('  ', ' ')
    return line
if __name__ == "__main__":
    if len(sys.argv) != 3:
        # Usage message kept in Chinese, as in the original post.
        print(' 用法:crf_score.py test_gold.utf8 your_tagger_output.utf8'.decode('utf8'))
        sys.exit(1)
    file_gold = codecs.open(sys.argv[1], 'r', 'utf8')
    file_tag = codecs.open(sys.argv[2], 'r', 'utf8')
    line1 = read_line(file_gold)
    N_count = 0       # total word count in the gold standard
    e_count = 0       # wrongly segmented words
    c_count = 0       # correctly segmented words
    e_line_count = 0  # lines differing from the gold standard
    c_line_count = 0  # lines identical to the gold standard
    while line1:
        line2 = read_line(file_tag)
        list1 = line1.split(' ')
        list2 = line2.split(' ')
        count1 = len(list1)  # gold word count for this line
        N_count += count1
        if line1 == line2:
            c_line_count += 1
            c_count += count1
        else:
            e_line_count += 1
            # Compare words by their (start, end) character offsets so a
            # word only counts as correct when both boundaries match.
            arr1 = []
            pos = 0
            for w in list1:
                arr1.append((pos, pos + len(w)))
                pos += len(w)
            arr2 = []
            pos = 0
            for w in list2:
                arr2.append((pos, pos + len(w)))
                pos += len(w)
            # Set membership is O(1); the original scanned a list per word.
            gold_spans = set(arr1)
            for tp in arr2:
                if tp in gold_spans:
                    c_count += 1
                else:
                    e_count += 1
        line1 = read_line(file_gold)
    file_gold.close()
    file_tag.close()
    if N_count == 0:
        # Empty gold file: avoid ZeroDivisionError and fail clearly.
        print('empty gold file, nothing to score')
        sys.exit(1)
    R = c_count * 100. / N_count
    P = c_count * 100. / (c_count + e_count)
    F = 2. * P * R / (P + R)
    # BUG FIX: the original printed the raw fraction followed by a '%'
    # sign (e.g. 0.0578...% instead of 5.78...%); scale to a percentage.
    ER = e_count * 100. / N_count
    print(' 标准词数:{} 个,正确词数:{} 个,错误词数:{} 个'.format(N_count, c_count, e_count).decode('utf8'))
    print(' 标准行数:{},正确行数:{} ,错误行数:{}'.format(c_line_count+e_line_count, c_line_count, e_line_count).decode('utf8'))
    print(' Recall: {}%'.format(R))
    print(' Precision: {}%'.format(P))
    print(' F MEASURE: {}%'.format(F))
    print(' ERR RATE: {}%'.format(ER))
使用以上步骤训练 pku_training.utf8 得到的 model,测试 pku_test.uft8 的结果:
标准词数:104372 个,正确词数:96211 个,错误词数:6037 个
标准行数:1944,正确行数:589 ,错误行数:1355
Recall: 92.1808531024%
Precision: 94.0957280338%
F MEASURE: 93.1284483593%
ERR RATE: 0.0578411834592%
测试 msr_test.utf8 的结果:
标准词数:106873 个,正确词数:93404 个,错误词数:17915 个
标准行数:3985,正确行数:754 ,错误行数:3231
Recall: 87.3971910585%
Precision: 83.9066107313%
F MEASURE: 85.6163379042%
ERR RATE: 0.167628867909%