#! /usr/bin/env python #encoding: utf-8 import nltk import commands #ファイル名取得 ls = commands.getoutput('/bin/ls Tw*') #print len(ls) files = ls.split('\n') #print len(files) #for file in files: # print file lists = [] vocab = {} AllWords = [] otaruSum = 0
#ファイルをリストに格納する
for file in files: print file list = [] otaru = 0 for line in open(file, 'r'): words = line[:-1].split(" ") for word in words: list.append(word) AllWords.append(word) if word == "@": otaru = 1 if vocab.has_key(word): vocab[word] += 1 else: vocab[word] =1 lists.append(list) print "RT" , list.count("RT") if otaru == 1: otaruSum += 1 print len(list) A = nltk.TextCollection(lists); print len(lists) print A.tf("RT",AllWords) print "RT:", A.idf("RT") print "QT" ,A.idf("QT") print "小樽:" ,A.idf("小樽") print "東京:" , A.idf("東京") print "札幌:" , A.idf("札幌") print otaruSum for file in files: list = [] for line in open(file, 'r'): words = line[:-1].split(" ") for word in words: list.append(word) tfidf = list.count("RT")*A.idf("RT") print file , "RT の TF=" , list.count("RT") , "IDF=" , A.idf("RT") , "TFxIDF=" , tfidf