2010年08月19日Python勉強会
をテンプレートにして作成
[
トップ
] [
新規
|
一覧
|
単語検索
|
最終更新
|
ヘルプ
]
開始行:
[[TopPage]]
***TFIDF [#mc607aac]
-1414ファイルのTFIDF
#! /usr/bin/env python
#encoding: utf-8
import nltk
import commands
# TF-IDF report for the token "RT" over every file matching Tw* in the
# current directory.
#
# NOTE(review): the wiki rendering of this script lost its indentation; the
# nesting below is reconstructed from the logic (one token list per file).
# Confirm the per-file placement of the print calls against the original.

# Collect the input file names.
import glob

# glob expands the pattern in-process: no shell is spawned, and an unmatched
# pattern gives an empty list instead of an error message being mistaken for
# a file name.  sorted() keeps the order deterministic.
files = sorted(glob.glob('Tw*'))
if not files:
    raise SystemExit("no input files match 'Tw*'")

lists = []      # one token list per file, in the same order as `files`
vocab = {}      # token -> number of occurrences over all files
AllWords = []   # every token of every file, concatenated
otaruSum = 0    # number of files containing at least one "@" token

# Load every file into a token list.
for path in files:
    print(path)
    tokens = []
    otaru = 0   # becomes 1 once a bare "@" token is seen in this file
    # `with` closes the handle even if reading fails part-way.
    with open(path, 'r') as handle:
        for line in handle:
            # Strip only the newline; slicing with [:-1] would also eat the
            # last character of a final line that has no trailing newline.
            words = line.rstrip('\n').split(" ")
            for word in words:
                tokens.append(word)
                AllWords.append(word)
                if word == "@":
                    otaru = 1
                vocab[word] = vocab.get(word, 0) + 1
    lists.append(tokens)
    print("RT %s" % tokens.count("RT"))
    if otaru == 1:
        otaruSum += 1
    print(len(tokens))

# Treat each file's token list as one document of the collection.
A = nltk.TextCollection(lists)
print(len(lists))
print(A.tf("RT", AllWords))

# The IDF of "RT" is needed again for the per-file report; compute it once.
rt_idf = A.idf("RT")
print("RT: %s" % rt_idf)
print("QT %s" % A.idf("QT"))
print("小樽: %s" % A.idf("小樽"))
print("東京: %s" % A.idf("東京"))
print("札幌: %s" % A.idf("札幌"))
print(otaruSum)

# Per-file report: raw count of "RT" multiplied by its IDF.  The token lists
# built above are reused instead of reading every file a second time.
for path, tokens in zip(files, lists):
    rt_tf = tokens.count("RT")
    tfidf = rt_tf * rt_idf
    print("%s RT の TF= %s IDF= %s TFxIDF= %s" % (path, rt_tf, rt_idf, tfidf))
***CRF in NLTK [#xb5cc6ef]
--http://nltk.googlecode.com/svn/trunk/doc/api/nltk.tag.crf.MalletCRF-class.html#__init__
終了行:
[[TopPage]]
***TFIDF [#mc607aac]
-1414ファイルのTFIDF
#! /usr/bin/env python
#encoding: utf-8
import nltk
import commands
# TF-IDF report for the token "RT" over every file matching Tw* in the
# current directory.
#
# NOTE(review): the wiki rendering of this script lost its indentation; the
# nesting below is reconstructed from the logic (one token list per file).
# Confirm the per-file placement of the print calls against the original.

# Collect the input file names.
import glob

# glob expands the pattern in-process: no shell is spawned, and an unmatched
# pattern gives an empty list instead of an error message being mistaken for
# a file name.  sorted() keeps the order deterministic.
files = sorted(glob.glob('Tw*'))
if not files:
    raise SystemExit("no input files match 'Tw*'")

lists = []      # one token list per file, in the same order as `files`
vocab = {}      # token -> number of occurrences over all files
AllWords = []   # every token of every file, concatenated
otaruSum = 0    # number of files containing at least one "@" token

# Load every file into a token list.
for path in files:
    print(path)
    tokens = []
    otaru = 0   # becomes 1 once a bare "@" token is seen in this file
    # `with` closes the handle even if reading fails part-way.
    with open(path, 'r') as handle:
        for line in handle:
            # Strip only the newline; slicing with [:-1] would also eat the
            # last character of a final line that has no trailing newline.
            words = line.rstrip('\n').split(" ")
            for word in words:
                tokens.append(word)
                AllWords.append(word)
                if word == "@":
                    otaru = 1
                vocab[word] = vocab.get(word, 0) + 1
    lists.append(tokens)
    print("RT %s" % tokens.count("RT"))
    if otaru == 1:
        otaruSum += 1
    print(len(tokens))

# Treat each file's token list as one document of the collection.
A = nltk.TextCollection(lists)
print(len(lists))
print(A.tf("RT", AllWords))

# The IDF of "RT" is needed again for the per-file report; compute it once.
rt_idf = A.idf("RT")
print("RT: %s" % rt_idf)
print("QT %s" % A.idf("QT"))
print("小樽: %s" % A.idf("小樽"))
print("東京: %s" % A.idf("東京"))
print("札幌: %s" % A.idf("札幌"))
print(otaruSum)

# Per-file report: raw count of "RT" multiplied by its IDF.  The token lists
# built above are reused instead of reading every file a second time.
for path, tokens in zip(files, lists):
    rt_tf = tokens.count("RT")
    tfidf = rt_tf * rt_idf
    print("%s RT の TF= %s IDF= %s TFxIDF= %s" % (path, rt_tf, rt_idf, tfidf))
***CRF in NLTK [#xb5cc6ef]
--http://nltk.googlecode.com/svn/trunk/doc/api/nltk.tag.crf.MalletCRF-class.html#__init__
ページ名: