# Provisional version 11/09
# Convert a corpus annotated with inline <TAG>...</TAG> markup into
# token-per-line training data (described by the original author as
# "CRF++ format"): every input line is stripped of its tags, tokenised by
# MeCab, and each token line is written out followed by an IOB label.
use strict;
use warnings;

# Tagged input data, one annotated sentence per line.  The hard-coded value
# is used unless a path is given as the first command-line argument.
my $file = defined $ARGV[0] ? $ARGV[0] : "";
# Destination file for the converted data.  The hard-coded value is used
# unless a path is given as the second command-line argument.
my $fileOut = defined $ARGV[1] ? $ARGV[1] : "";

# Scratch file (in the current directory) used to hand one tag-stripped line
# at a time to MeCab.
my $mecab_input = "mecab.input";

open(my $in, '<', $file)
    or die "Cannot open input file '$file': $!";
open(my $out, '>', $fileOut)
    or die "Cannot open output file '$fileOut': $!";

while (my $before = <$in>) {
    chomp $before;

    # Remove the tags, echoing the line before and after to STDOUT.
    my $after = $before;
    print "[Á°]$before\n";
    $after =~ s/<.*?>//g;
    print "[¸å]$after\n";

    # Remember which surface strings were tagged, and with which tag name.
    my %tag_of = _collect_tagged_words($before);

    # Morphologically analyse the tag-stripped text.
    open(my $mecab, '>', $mecab_input)
        or die "Cannot write '$mecab_input': $!";
    print {$mecab} $after;
    close($mecab)
        or die "Cannot close '$mecab_input': $!";

    # Run MeCab.  "-Ocrf" is presumably an output format defined in the local
    # MeCab configuration -- TODO confirm.
    my @M = `/usr/local/bin/mecab -Ocrf < $mecab_input`;
    die "mecab failed (wait status $?) on input line $." if $? != 0;
    chomp @M;

    my @labels = _assign_iob_labels(\@M, \%tag_of);

    #print $out "\n$before\n";
    for my $i (0 .. $#M) {
        # Tokens that were not matched to any tagged word get the O label.
        my $label = defined $labels[$i] ? $labels[$i] : "O";
        print {$out} "$M[$i] $label\n";
    }
}

close($out) or die "Cannot close output file '$fileOut': $!";
close($in);

# Return a hash mapping each tagged surface string in $line to its tag name.
#
# The line is split on "</"; in every piece, the text after the last
# "<name>" is taken as the tagged value.  When the same surface string is
# tagged more than once the first tag name is kept, so a repeated entity
# yields "PERSON" rather than a concatenated "PERSONPERSON".
sub _collect_tagged_words {
    my ($line) = @_;

    my %tag_of;
    for my $segment (split /<\//, $line) {
        next unless $segment =~ /^(.*)<(.*?)>(.*?)$/;
        my ($name, $value) = ($2, $3);

        # An empty tag name or empty content can never label a token.
        next if $name eq '' or $value eq '';

        $tag_of{$value} = $name unless exists $tag_of{$value};
    }
    return %tag_of;
}

# Return a list of IOB labels parallel to the MeCab output lines in @$tokens;
# entries for unmatched tokens are left undefined.
#
# IOB tag scheme noted by the original author (only B, I and O are produced
# here):
#   B: first token of a chunk
#   E: last token of a chunk
#   I: token inside a chunk
#   S: chunk consisting of a single token
#   O: token outside any chunk
#
# Provisional matching strategy: for each token position $i, the surfaces of
# tokens $i, $i-1, ... are concatenated backwards, and the first
# concatenation that equals a tagged word labels that token span.  The
# surface is assumed to be the first whitespace-separated field of each
# MeCab output line -- TODO confirm against the -Ocrf format.
sub _assign_iob_labels {
    my ($tokens, $tag_of) = @_;

    my @labels;
    my @surfaces;
    for my $i (0 .. $#{$tokens}) {
        my ($surface) = split /\s/, $tokens->[$i];
        $surface = '' unless defined $surface;
        # Keeping surfaces in an array (instead of re-splitting a joined
        # string) keeps indices aligned even when a surface is empty.
        push @surfaces, $surface;

        my $joined = '';
        for (my $j = $i; $j >= 0; $j--) {
            $joined = $surfaces[$j] . $joined;
            next unless exists $tag_of->{$joined};

            # The B tag goes on the first token of the span only...
            $labels[$j] = "B-$tag_of->{$joined}";
            # ...and I tags on every following token up to position $i.
            for my $k ($j + 1 .. $i) {
                $labels[$k] = "I-$tag_of->{$joined}";
            }
            last;
        }
    }
    return @labels;
}