Frequency of occurrence of a term in a tweet dataset
July 16, 2014 Leave a comment
I am a novice Python coder and this algorithm is simple, but I am still overjoyed that my Python coding skills are improving.
Frequency of a term = number of occurrences of the term / total number of unique words
import io
import json
import os.path
import re
import sys

# Everything that is not an ASCII letter or whitespace is stripped from a
# tweet's text before it is split into terms.
_NON_LETTERS = re.compile(r"[^A-Z\sa-z]")


class Frequency(object):
    """Compute per-term frequencies over a file of JSON tweets.

    The input file is taken from ``sys.argv[1]`` and is read one JSON
    document per line.  Only documents with a non-empty ``text`` field and
    ``lang == 'en'`` are counted.

    The frequency of a term is its number of occurrences divided by the
    number of distinct terms seen.  Results are stored in ``allterms``
    (lower-cased term -> frequency).
    """

    def __init__(self):
        """Warn if the input file is unusable and set up empty term tables."""
        path = sys.argv[1] if len(sys.argv) > 1 else None
        if path is None or not (os.path.isfile(path)
                                and os.access(path, os.R_OK)):
            print("Either file is missing or it is not readable")
        # Public result table: term -> frequency (0 until first computed).
        self.allterms = {}
        # Raw occurrence counts.  Kept apart from ``allterms`` so that
        # normalising never feeds already-divided values back into the
        # counting step.
        self._counts = {}

    def totalterms(self, text):
        """Register every whitespace-separated word of *text* as a term.

        Terms are lower-cased.  Terms that are already known keep their
        current values; new ones start at zero.
        """
        for word in text.split():
            term = word.lower()
            self._counts.setdefault(term, 0)
            self.allterms.setdefault(term, 0)

    def calculatefrequency(self, text):
        """Count the words of *text* and refresh every frequency.

        Only terms previously registered through ``totalterms`` are counted.
        Frequencies are always recomputed from the raw counts, so calling
        this method repeatedly does not divide earlier results again.
        """
        self._count(text)
        self._normalize()

    def analyze(self):
        """Read the tweet file, compute all frequencies and print them.

        Each output line is ``<term> <frequency>`` with six decimals.
        Lines that are blank are skipped; lines that are not valid JSON or
        do not have the expected structure print ``Error`` and are skipped.
        """
        # io.open behaves the same on Python 2 and 3; undecodable bytes are
        # replaced rather than aborting the whole run.
        with io.open(sys.argv[1], 'r', encoding='utf-8',
                     errors='replace') as f:
            for data in f:
                if not data.strip():
                    continue
                try:
                    d = json.loads(data)
                    if d.get('text') and d.get('lang') == 'en':
                        tex = _NON_LETTERS.sub("", d['text'])
                        self.totalterms(tex)
                        self._count(tex)
                except (ValueError, KeyError, TypeError, AttributeError):
                    print("Error")
        # Normalise once, after every tweet has been counted.
        self._normalize()
        for key, value in self.allterms.items():
            print("%s %.6f" % (key, value))

    def _count(self, text):
        """Add one occurrence for each already-registered word in *text*."""
        for word in text.split():
            term = word.lower()
            if term in self._counts:
                self._counts[term] += 1

    def _normalize(self):
        """Set each frequency to raw count / number of distinct terms."""
        unique = len(self._counts)
        for term, occurrences in self._counts.items():
            # float() keeps true division on Python 2 as well.
            self.allterms[term] = float(occurrences) / unique


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit("Usage: %s <tweets-file>" % sys.argv[0])
    frequency = Frequency()
    frequency.analyze()
nigga 0.000027 old 0.000027 worldcup 0.000002 list 0.000027 it 0.000002 years 0.000027 see 0.025000 done 0.000004 have 0.025000 shit 0.000027 rt 0.000002 from 0.025000 also 0.000002 top 0.000027 had 0.000002 guitarmandan 0.000002 to 0.000004 win 0.000002 you 0.050000 today 0.000027 me 0.025027 fr 0.000781 someone 0.000002 but 0.000781 moment 0.025000 germany 0.000002 no 0.025000 not 0.025781 come 0.000027 cool 0.000027 a 0.000027 on 0.000027 like 0.000027 of 0.000027 hes 0.000027 well 0.000004 chance 0.025000 calling 0.000027 caring 0.025000 the 0.025027