# TF-IDF over 1414 files (header note; original line was mojibake Japanese)
#! /usr/bin/env python
#encoding: utf-8
import nltk
import commands
# Collect the input filenames: every file in the cwd whose name starts with "Tw"
ls = commands.getoutput('/bin/ls Tw*')
#print len(ls)
files = ls.split('\n')
#print len(files)
#for file in files:
# print file
lists = []      # one word-list per file (fed to nltk.TextCollection below)
vocab = {}      # word -> global occurrence count across all files
AllWords = []   # every word from every file, concatenated
otaruSum = 0    # number of files that contain the token "@"
#¥Õ¥¡¥¤¥ë¤ò¥ê¥¹¥È¤Ë³ÊǼ¤¹¤ë
for file in files:
print file
list = []
otaru = 0
for line in open(file, 'r'):
words = line[:-1].split(" ")
for word in words:
list.append(word)
AllWords.append(word)
if word == "@":
otaru = 1
if vocab.has_key(word):
vocab[word] += 1
else:
vocab[word] =1
lists.append(list)
print "RT" , list.count("RT")
if otaru == 1:
otaruSum += 1
print len(list)
A = nltk.TextCollection(lists);
print len(lists)
print A.tf("RT",AllWords)
print "RT:", A.idf("RT")
print "QT" ,A.idf("QT")
print "¾®Ã®:" ,A.idf("¾®Ã®")
print "Åìµþ:" , A.idf("Åìµþ")
print "»¥ËÚ:" , A.idf("»¥ËÚ")
print otaruSum
for file in files:
list = []
for line in open(file, 'r'):
words = line[:-1].split(" ")
for word in words:
list.append(word)
tfidf = list.count("RT")*A.idf("RT")
print file , "RT ¤Î TF=" , list.count("RT") , "IDF=" , A.idf("RT") , "TFxIDF=" , tfidf