CRL固有表現データ

固有表現抽出の学習

実際のファイル

$ cat train.data

	He        PRP  B-NP
	reckons   VBZ  B-VP
	the       DT   B-NP
	current   JJ   I-NP
	account   NN   I-NP
	deficit   NN   I-NP
	will      MD   B-VP
	narrow    VB   I-VP
	to        TO   B-PP
	only      RB   B-NP
	#         #    I-NP
	1.8       CD   I-NP
	billion   CD   I-NP
	in        IN   B-PP
	September NNP  B-NP
	.         .    O
	
	He        PRP  B-NP
	reckons   VBZ  B-VP

$ cat template

	# Unigram
	U00:%x[-2,0]
	U01:%x[-1,0]
	U02:%x[0,0]
	U03:%x[1,0]
	U04:%x[2,0]
	U05:%x[-1,0]/%x[0,0]
	U06:%x[0,0]/%x[1,0]
	
	U10:%x[-2,1]
	U11:%x[-1,1]
	U12:%x[0,1]
	U13:%x[1,1]
	U14:%x[2,1]
	U15:%x[-2,1]/%x[-1,1]
	U16:%x[-1,1]/%x[0,1]
	U17:%x[0,1]/%x[1,1]
	U18:%x[1,1]/%x[2,1]
	
	U20:%x[-2,1]/%x[-1,1]/%x[0,1]
	U21:%x[-1,1]/%x[0,1]/%x[1,1]
	U22:%x[0,1]/%x[1,1]/%x[2,1]

$ crf_learn -a MIRA template train.data model

	CRF++: Yet Another CRF Tool Kit
	Copyright(C)2005-2007 Taku Kudo, All rights reserved.
	
	reading training data:
	Done!0.00 s
	
	Number of sentences: 2
	Number of features:  1800
	Number of thread(s): 1
	Freq:                1
	eta:                 0.00010
	C:                   1.00000
	shrinking size:      20
	Algorithm:           MIRA
	
	iter=0 terr=0.66667 serr=0.50000 act=2 uact=0 obj=0.30126 kkt=12.00000
	iter=1 terr=0.16667 serr=0.50000 act=2 uact=0 obj=0.36494 kkt=2.84937
	iter=2 terr=0.00000 serr=0.00000 act=2 uact=0 obj=0.36494 kkt=0.00000
	iter=3 terr=0.00000 serr=0.00000 act=2 uact=0 obj=0.36494 kkt=0.00000
	
	Done!0.00 s

$ crf_test -m model test.data

	He      PRP     B-NP    B-NP
	reckons VBZ     B-VP    B-VP
	the     DT      B-NP    B-NP
	current JJ      I-NP    I-NP
	account NN      I-NP    I-NP
	deficit NN      I-NP    I-NP
	will    MD      B-VP    B-VP
	narrow  VB      I-VP    I-VP
	to      TO      B-PP    B-PP
	only    RB      B-NP    B-NP
	#       #       I-NP    I-NP
	1.8     CD      I-NP    I-NP
	billion CD      I-NP    I-NP
	in      IN      B-PP    B-PP
	September       NNP     B-NP    B-NP
	.       .       O       O
	
	He      PRP     B-NP    B-NP
	reckons VBZ     B-VP    B-VP