[[TopPage]]

*** NLTK TFIDF [#s63f9719]
+http://nltk.googlecode.com/svn/trunk/doc/api/nltk.text.TextCollection-class.html
--import nltk
--from nltk.book import *
+NLTKを利用して、TFIDFを求める。
	#! /usr/bin/env python
	#encoding: utf-8
	import nltk
	import commands
	
	# Compute TF and IDF of the token "RT" over every file in the current
	# directory whose name matches Tw*, using nltk.TextCollection.
	#
	# Single-argument print(...) is used so the statements are valid on both
	# Python 2 and Python 3.
	
	# Get the file names.
	# glob replaces the former `/bin/ls Tw*` shell call: commands.getoutput
	# returns stderr mixed into stdout, so an `ls` failure used to end up being
	# treated as a file name. sorted() keeps the alphabetical order ls produced.
	import glob
	
	paths = sorted(glob.glob('Tw*'))
	if not paths:
	        raise SystemExit('no files matching Tw* found')
	
	documents = []  # one token list per input file
	all_words = []  # tokens of all files, concatenated in reading order
	# Store each file as a list of tokens.
	for path in paths:
	        print(path)
	        tokens = []
	        # `with` guarantees the handle is closed even if reading fails.
	        with open(path, 'r') as handle:
	                for line in handle:
	                        # Strip only the line terminator; the previous line[:-1]
	                        # cut off a real character when the last line of a
	                        # file had no trailing newline.
	                        tokens.extend(line.rstrip('\r\n').split(" "))
	        documents.append(tokens)
	        all_words.extend(tokens)
	        print(len(tokens))
	
	collection = nltk.TextCollection(documents)
	
	# Number of documents, then tf of "RT" measured against the concatenation
	# of all documents, then idf of "RT" across the documents.
	print(len(documents))
	print(collection.tf("RT", all_words))
	print(collection.idf("RT"))