Frequency of occurrence of a term in a tweet dataset

I am a novice Python coder and this algorithm is simple. Still, I am overjoyed that my Python coding skills are improving.

Frequency of a term = number of occurrences of the term / total number of unique words

import json
import os.path
import re
import sys


class Frequency(object):
    """Compute the relative frequency of every term in a file of tweets.

    The input file is read line by line and each line is parsed as one JSON
    document. Only records that are JSON objects with a non-empty ``text``
    field and ``lang == 'en'`` are used. Every character other than an ASCII
    letter or whitespace is removed from the text, the text is split on
    whitespace and each term is lower-cased.

    The frequency of a term is defined as::

        occurrences of the term / number of unique terms

    Attributes:
        path: path of the tweet file read by ``analyze``.
        allterms: dict mapping each known term to its frequency (0 for a
            term that has been registered but not counted yet).
    """

    # Matches everything that is neither an ASCII letter nor whitespace.
    # Raw string so that ``\s`` reaches the regex engine untouched.
    _NON_LETTERS = re.compile(r"[^A-Z\sa-z]")

    def __init__(self, path=None):
        """Create an analyzer for the tweet file at *path*.

        Args:
            path: file to analyze. When omitted, the first command-line
                argument (``sys.argv[1]``) is used.

        Raises:
            IndexError: no *path* was given and there is no command-line
                argument to fall back on.
            IOError: the file does not exist or is not readable.
        """
        if path is None:
            if len(sys.argv) < 2:
                raise IndexError(
                    "no tweet file given: pass a path or a command-line argument")
            path = sys.argv[1]
        # Fail here rather than printing a warning and crashing later in
        # analyze() when the file is opened.
        if not (os.path.isfile(path) and os.access(path, os.R_OK)):
            raise IOError(
                "Either file is missing or it is not readable: %s" % path)
        self.path = path
        self.allterms = {}
        # Raw occurrence counts. They are kept apart from the frequencies in
        # ``allterms`` so that normalizing never corrupts the running totals.
        self._counts = {}

    def totalterms(self, text):
        """Register every term of *text* in the vocabulary.

        Terms that are not known yet are added to ``allterms`` with a value
        of 0; terms that are already known are left untouched.
        """
        for word in text.split():
            self.allterms.setdefault(word.lower(), 0)

    def calculatefrequency(self, text):
        """Count the known terms of *text* and refresh all frequencies.

        Only terms already present in ``allterms`` (see ``totalterms``) are
        counted. Afterwards every value in ``allterms`` is recomputed from
        the raw counts, so calling this method repeatedly does not divide
        earlier results again.
        """
        for word in text.split():
            term = word.lower()
            if term in self.allterms:
                self._counts[term] = self._counts.get(term, 0) + 1
        self._normalize()

    def _count(self, text):
        """Register and count every term of *text* without normalizing."""
        for word in text.split():
            term = word.lower()
            self.allterms.setdefault(term, 0)
            self._counts[term] = self._counts.get(term, 0) + 1

    def _normalize(self):
        """Set each value of ``allterms`` to count / number of unique terms."""
        unique = len(self.allterms)
        # The loop body never runs for an empty vocabulary, so there is no
        # division by zero.
        for term in self.allterms:
            self.allterms[term] = float(self._counts.get(term, 0)) / unique

    def analyze(self):
        """Read the tweet file, compute term frequencies and print them.

        Blank lines and records that are not JSON objects are skipped
        silently. Lines that are not valid JSON, and records whose ``text``
        is not a string, are reported on standard error and skipped. One
        ``term frequency`` line (six decimals) is printed per term.
        """
        with open(self.path, 'r') as f:
            for lineno, line in enumerate(f, 1):
                if not line.strip():
                    continue
                try:
                    tweet = json.loads(line)
                except ValueError:
                    sys.stderr.write(
                        "Error: line %d is not valid JSON\n" % lineno)
                    continue
                if not isinstance(tweet, dict):
                    continue
                text = tweet.get('text')
                if not text or tweet.get('lang') != 'en':
                    continue
                try:
                    cleaned = self._NON_LETTERS.sub("", text)
                except TypeError:
                    sys.stderr.write(
                        "Error: line %d has a non-text 'text' field\n" % lineno)
                    continue
                self._count(cleaned)
        # Normalize once, after all tweets have been counted.
        self._normalize()
        for term, value in self.allterms.items():
            print(term + " " + "%.6f" % value)
            
                  
if __name__ == '__main__':
    # Script entry point: the tweet file is taken from the first command-line
    # argument. Exit with a usage message (written to standard error, exit
    # status 1) instead of an IndexError traceback when it is missing.
    if len(sys.argv) < 2:
        sys.exit("usage: %s <tweet_file>" % sys.argv[0])

    frequency = Frequency()
    frequency.analyze()
nigga 0.000027
old 0.000027
worldcup 0.000002
list 0.000027
it 0.000002
years 0.000027
see 0.025000
done 0.000004
have 0.025000
shit 0.000027
rt 0.000002
from 0.025000
also 0.000002
top 0.000027
had 0.000002
guitarmandan 0.000002
to 0.000004
win 0.000002
you 0.050000
today 0.000027
me 0.025027
fr 0.000781
someone 0.000002
but 0.000781
moment 0.025000
germany 0.000002
no 0.025000
not 0.025781
come 0.000027
cool 0.000027
a 0.000027
on 0.000027
like 0.000027
of 0.000027
hes 0.000027
well 0.000004
chance 0.025000
calling 0.000027
caring 0.025000
the 0.025027

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Google+ photo

You are commenting using your Google+ account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s

%d bloggers like this: