# Compute TF-IDF using NLTK.
#! /usr/bin/env python
#encoding: utf-8
import commands
import glob

import nltk
def read_tokens(path):
    """Return the tokens of the text file at *path* as a flat list.

    Each line is stripped of its trailing newline and split on single
    space characters.  Consecutive spaces and empty lines therefore yield
    empty-string tokens, exactly as a plain ``split(" ")`` does.

    Args:
        path: Name of the file to read.

    Returns:
        A list of str tokens in file order; empty for an empty file.

    Raises:
        IOError / OSError: If the file cannot be opened or read.
    """
    tokens = []
    # The context manager guarantees the handle is closed after reading.
    with open(path, 'r') as handle:
        for line in handle:
            # rstrip('\n') instead of line[:-1]: the last line of a file may
            # lack a trailing newline, and slicing would eat a real character.
            tokens.extend(line.rstrip('\n').split(" "))
    return tokens


if __name__ == "__main__":
    # Collect the input file names.  glob avoids spawning a shell and never
    # mixes error text into the result; sorting makes the order deterministic.
    files = sorted(glob.glob('Tw*'))
    if not files:
        raise SystemExit("no input files matching 'Tw*' were found")

    # Load every file: `lists` holds one token list per file (the documents),
    # `AllWords` is the concatenation of all tokens across files.
    lists = []
    AllWords = []
    for path in files:
        # Single-argument print(...) produces the same output under both the
        # Python 2 print statement and the Python 3 print function.
        print(path)
        tokens = read_tokens(path)
        lists.append(tokens)
        AllWords.extend(tokens)
        print(len(tokens))

    # Build the collection with one text per input file, then report the
    # number of documents, the term frequency of "RT" over all tokens, and
    # the inverse document frequency of "RT" across the files.
    A = nltk.TextCollection(lists)
    print(len(lists))
    print(A.tf("RT", AllWords))
    print(A.idf("RT"))