# Recursively copy the 20100805 log directory into the current directory.
cp -r /home/maeda/TwitterLog/20100805/ .
#! /usr/bin/env python
# encoding: utf-8
"""Generate text with NLTK from a whitespace-tokenised corpus file.

Reads ``./sample.mecab`` (presumably MeCab output -- confirm), prints the
number of tokens, then asks ``nltk.Text`` to generate 300 tokens.
"""
import nltk

# ``with`` guarantees the corpus file is closed even if reading fails.
with open('./sample.mecab') as corpus:
    raw = corpus.read()

words = raw.split()
print(len(words))

text = nltk.Text(words)
# NOTE(review): depending on the NLTK version, generate() may print the
# generated text itself; its return value is printed as well, exactly as the
# original script did.
gen = text.generate(300)
print(gen)
#!/usr/bin/python
# encoding: utf-8
"""Print bigram frequencies for the whitespace-tokenised ``sample.dat``.

Lists bigrams with their counts until one whose count is exactly 100 is
reached, then prints the distribution of words following '私'.
"""
import nltk

# ``with`` guarantees the corpus file is closed even if reading fails.
with open('sample.dat') as corpus:
    raw = corpus.read()

words = raw.split()

# The pairs are consumed twice below (FreqDist and ConditionalFreqDist).
# Newer NLTK returns a one-shot generator from bigrams(), so materialise it;
# on versions that already return a list this is a harmless copy.
bigrams = list(nltk.bigrams(words))

fd = nltk.FreqDist(bigrams)
# NOTE(review): stopping at the first count of 100 presumably relies on
# FreqDist iterating in decreasing-frequency order -- confirm for the
# installed NLTK version.
for w in fd:
    if fd[w] == 100:
        break
    # Original author's note: this output comes out garbled (mojibake).
    print("%s %s %s" % (w[0], w[1], fd[w]))

cfd = nltk.ConditionalFreqDist(bigrams)
# The key was mis-decoded as '»ä' in the original paste (EUC-JP bytes read as
# Latin-1); restored to the intended word.
print(cfd['私'])
# Strip carriage returns (DOS line endings) from every *.dat file under the
# current directory, editing the files in place (GNU sed).
#
# `-exec ... {} +` passes file names directly to sed, so names containing
# whitespace are handled and sed is not run at all when nothing matches.
# `\r` replaces the literal ^M of the original command, which had to be typed
# as [Ctrl]+[v] followed by [Ctrl]+[M] and does not survive copy-and-paste.
find . -name "*.dat" -exec sed -i 's/\r//g' {} +
# vim: set fileencoding=utf-8 :
"""Collect the tweet text from every ``*.dat`` log into ``tweetList.txt``."""
import codecs
import glob


def collect_tweets(pattern='*.dat', out_path='tweetList.txt', marker=b'2010'):
    """Write one tweet per line to *out_path* from the logs matching *pattern*.

    Every log line is decoded as UTF-8, re-encoded, and split on whitespace.
    The tokens following the first *marker* token (the year field; what
    follows "2010" is the tweet) are concatenated without separators and
    become one output line.

    Lines that do not contain *marker* are skipped instead of aborting the
    whole run with ``ValueError``.

    Args:
        pattern: glob pattern selecting the input log files.
        out_path: path of the output file (overwritten).
        marker: UTF-8 encoded token that precedes the tweet text.
    """
    tweets = []
    # sorted() mirrors the name-ordered listing the original got from `ls`.
    for name in sorted(glob.glob(pattern)):
        with codecs.open(name, 'r', 'utf-8') as log:
            for record in log:
                # Work on UTF-8 bytes so only ASCII whitespace separates
                # tokens, as in the original byte-string processing.
                tokens = record.encode('utf-8').split()
                try:
                    start = tokens.index(marker) + 1
                except ValueError:
                    continue
                tweets.append(b''.join(tokens[start:]))

    with open(out_path, 'wb') as out:
        for tweet in tweets:
            out.write(tweet + b'\n')


if __name__ == '__main__':
    collect_tweets()
#!/usr/bin/env python
"""Print every line of a Twitter log file to standard output."""


def dump_lines(path='TwitterLog20100805-1600.dat'):
    """Print each line of *path*.

    Lines keep their own trailing newline and ``print`` adds another, so the
    output is double-spaced, exactly as in the original script.

    Args:
        path: log file to read.
    """
    # ``with`` closes the log even if printing fails part-way through.
    with open(path, 'r') as log:
        for line in log:
            print(line)


if __name__ == '__main__':
    dump_lines()
"""Write a fixed sample string to ``Comment.txt``."""

strs = "abc"


def write_comment(text=strs, path='Comment.txt'):
    """Overwrite *path* with *text*.

    Args:
        text: string to write; defaults to the module-level sample ``strs``.
        path: output file (truncated if it exists).
    """
    # ``with`` closes the file even if the write raises.
    with open(path, 'w') as out:
        out.write(text)


if __name__ == '__main__':
    write_comment()
#!/usr/bin/env python
"""Echo a Twitter log to stdout while copying it to ``Comment.txt``."""


def copy_lines(src='TwitterLog20100805-1600.dat', dst='Comment.txt'):
    """Print every line of *src* and write it unchanged to *dst*.

    Args:
        src: log file to read.
        dst: output file (truncated if it exists).
    """
    # Open the input first so a missing log does not truncate *dst*; both
    # files are closed on any exit path.
    with open(src, 'r') as log:
        with open(dst, 'w') as out:
            for line in log:
                print(line)
                out.write(line)


if __name__ == '__main__':
    copy_lines()
#!/usr/bin/env python
"""Extract one tab-separated field of each log line into ``Comment.txt``."""


def extract_field(src='TwitterLog20100805-1600.dat', dst='Comment.txt',
                  field=3):
    """Print each line of *src* and write its *field*-th column to *dst*.

    Each line is echoed to stdout, split on tabs, and the selected column is
    echoed and written to *dst* verbatim (no separator is added, as in the
    original script). Lines with too few columns, such as blank lines, are
    echoed but otherwise skipped instead of raising ``IndexError``.

    Args:
        src: tab-separated log file to read.
        dst: output file (truncated if it exists).
        field: zero-based index of the column to extract.
    """
    # Open the input first so a missing log does not truncate *dst*; both
    # files are closed on any exit path.
    with open(src, 'r') as log:
        with open(dst, 'w') as out:
            for line in log:
                print(line)
                items = line.split("\t")
                if len(items) <= field:
                    continue
                print(items[field])
                out.write(items[field])


if __name__ == '__main__':
    extract_field()