Parse JSP using BeautifulSoup
July 18, 2013 Leave a comment
I had to parse a tangle of JSPs to identify how many HTML controls were calling JavaScript
functions that make AJAX calls back to the application.
For example, if a ‘key press’ event is fired when a user ‘tabs out’ of a textbox or presses
‘Enter’ in it, I wanted the scan to find that control.
<html:text maxlength="30" onblur="blurAction(this)" onfocus="displayFieldMsg(this)" onkeydown="keyDownEvents(this)" onkeypress="keyPressEvents(this)" onkeyup="convertUCase(this)" property="txtExtCredit" size="40" style="text-align:left;" styleclass="inputfld"></html:text>
My Python skills are rudimentary, but this code is able to scan and show a list of ‘html:text’ Struts tags. The PyDev Eclipse plugin comes in handy for Python development.
The code can be further enhanced for more complex scans, which I plan to do.
from bs4 import BeautifulSoup
import fnmatch
import sys
import re
import os
import glob
class Parse:
    """Scan JSP files for Struts ``html:text`` tags that call ``keyPressEvents``.

    Creating an instance prints ``parsing`` and immediately runs
    :meth:`parse` against the default directory.  :meth:`folderwalk` is a
    recursive file lister that is not called by the constructor.

    All output goes to stdout.  Every ``print`` call takes exactly one
    argument so the text produced is the same on Python 2 and Python 3.
    """

    # Directory scanned when the caller does not supply one.
    DEFAULT_PATH = "D:\\path"
    # Matches tag names that start with ``html:text``.
    TAG_PATTERN = re.compile('^html:text')
    # Matches ``onkeypress`` attribute values that start with ``keyPressEvents``.
    HANDLER_PATTERN = re.compile('^keyPressEvents')

    def __init__(self):
        """Announce the run and scan the default directory."""
        print('parsing')
        self.parse()

    def parse(self, path=DEFAULT_PATH):
        """Print every matching ``html:text`` tag found in ``path``'s JSP files.

        Only ``*.jsp`` files directly inside ``path`` are examined (no
        recursion).  For each file its path is printed first, followed by
        each tag whose name matches ``TAG_PATTERN`` and whose ``onkeypress``
        attribute matches ``HANDLER_PATTERN``.

        Args:
            path: Directory to scan.  Defaults to ``DEFAULT_PATH``.

        Returns:
            None.  Results and error reports are written to stdout.
        """
        for markup in glob.glob(os.path.join(path, "*.jsp")):
            print(markup)
            # Errors are handled per file so that one unreadable or
            # unparsable JSP does not abort the rest of the scan, and so
            # that ``markup`` is always bound when it is reported.
            try:
                # The context manager closes the handle even if parsing fails.
                with open(markup, "r") as handle:
                    # Name the parser explicitly so results do not depend on
                    # which optional parser libraries happen to be installed.
                    soup = BeautifulSoup(handle.read(), "html.parser")
                matches = soup.find_all(
                    self.TAG_PATTERN,
                    attrs={'onkeypress': self.HANDLER_PATTERN},
                )
                for tag in matches:
                    print(tag)
            except IOError as e:
                print("I/O error({0}): {1}".format(e.errno, e.strerror))
            except Exception:
                # ``Exception`` rather than a bare ``except`` so that
                # KeyboardInterrupt and SystemExit still propagate.
                print("Unexpected error: {0}".format(sys.exc_info()[0]))
                print("Unexpected error: {0}".format(markup))

    # Not used at this time
    def folderwalk(self, rootdir=DEFAULT_PATH):
        """Print the name of every ``*.jsp`` file at or below ``rootdir``.

        Only the bare file name is printed, not its directory.

        Args:
            rootdir: Directory at which the recursive walk starts.
                Defaults to ``DEFAULT_PATH``.

        Returns:
            None.  File names and error reports are written to stdout.
        """
        # Patterns to be matched
        includes = ['*.jsp']
        try:
            for _root, _subfolders, files in os.walk(rootdir):
                for pattern in includes:
                    for filename in fnmatch.filter(files, pattern):
                        print(filename)
        except IOError as e:
            print("I/O error({0}): {1}".format(e.errno, e.strerror))
        except Exception:
            print("Unexpected error: {0}".format(sys.exc_info()[0]))
# Script entry point: constructing Parse runs the scan as a side effect,
# so the instance itself does not need to be kept.
if __name__ == "__main__":
    Parse()