Parsing HTML using BeautifulSoup
April 6, 2015 Leave a comment
This Python code parses HTML, but it seems to truncate the tags when I print them. I am attempting to check for the presence of the ID attribute in the tags. The code simply iterates over all tags; it does not look for any specific HTML control. It just matches opening and closing tags arbitrarily. I am still working on it and will update it.
D:\Python Data Analytics\view.html No Tag 0 1 <div class="panel-collapse collapse" id="activ... 1 1 <select class="selectpicker " id="condition1... 2 1 <select class="selectpicker " id="condition2... 3 1 <select class="selectpicker " id="condition3... 4 1 <select class="selectpicker " id="condition4... 5 1 <select class="selectpicker " id="condition5... 6 1 <input class="btn xbtn-primary save" id="ApS... 7 1 <input class="btn btn-link" id="Cancel" name...
from collections import Counter
import fnmatch
from html import escape
import os
import sys

from bs4 import BeautifulSoup as bs
import pandas as pd

# Glob patterns selecting the files to analyse.
INCLUDES = ('*.html',)

# Attribute string pandas emits on its tables, and the Bootstrap one we want.
_PANDAS_TABLE = '<table border="1" class="dataframe">'
_BOOTSTRAP_TABLE = '<table class="table table-striped">'

# Written once at the top / bottom of the report; one section per file goes
# in between.
_REPORT_HEAD = ''' <html> <head> <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.1/css/bootstrap.min.css"> <style>body{ margin:0 100; background:whitesmoke; }</style> </head> <body> <h1>HTML 5 Report</h1> '''
_REPORT_TAIL = ''' </body> </html>'''


class Parse:
    """Report which tags in a tree of HTML files carry an ``id`` attribute.

    For every file matching ``INCLUDES`` under ``path`` the report lists the
    tags that have an ``id``, the tags that do not, and the ``id`` values
    that occur more than once in that file.  Instantiating the class runs
    the analysis immediately.
    """

    def __init__(self, path="D:\\Python Data Analytics\\",
                 report_path=r'D:\python\report.html'):
        """Store the locations and run :meth:`parse`.

        Args:
            path: Directory that is walked recursively for HTML files.
            report_path: File the HTML report is written to (overwritten).
        """
        self.path = path
        self.report_path = report_path
        self.parse()

    def parse(self):
        """Analyse every matching file and write the combined HTML report.

        I/O errors and any other exception are reported on stdout rather
        than propagated, so a failure never escapes to the caller.
        """
        try:
            # Non-ASCII characters are written as numeric character
            # references, so the report renders correctly whatever encoding
            # the browser assumes.
            with open(self.report_path, 'w', encoding='ascii',
                      errors='xmlcharrefreplace') as report:
                report.write(_REPORT_HEAD)
                # max_colwidth=None stops pandas from cutting long tags
                # short ("<div class=...") in both print() and to_html();
                # the context manager restores the previous setting.
                with pd.option_context('display.max_colwidth', None):
                    for html_path in self._html_files():
                        report.write(self._file_section(html_path))
                report.write(_REPORT_TAIL)
        except IOError as e:
            print("I/O error({0}): {1}".format(e.errno, e.strerror))
        except Exception as err:
            print("Unexpected error: {0}: {1}".format(sys.exc_info()[0], err))

    def _html_files(self):
        """Yield the full path of every file under ``path`` matching INCLUDES."""
        report = os.path.abspath(self.report_path)
        for root, _sub_folders, files in os.walk(self.path):
            for pattern in INCLUDES:
                for name in fnmatch.filter(files, pattern):
                    # Join with ``root`` so files in sub-folders can be opened.
                    candidate = os.path.join(root, name)
                    # Never analyse the report we are currently writing.
                    if os.path.abspath(candidate) != report:
                        yield candidate

    def _file_section(self, html_path):
        """Return the report fragment (headings and tables) for one file."""
        # Binary mode: BeautifulSoup detects the document encoding itself.
        with open(html_path, 'rb') as handle:
            soup = bs(handle, 'html.parser')

        with_id, without_id = [], []
        for tag in soup.find_all(True):
            (with_id if tag.has_attr('id') else without_id).append(tag)

        present = pd.DataFrame(
            [{'ID': tag['id'], 'Tag': str(tag)} for tag in with_id],
            columns=['ID', 'Tag'],
        )
        missing = pd.DataFrame(
            {'Tag': [str(tag) for tag in without_id]},
            columns=['Tag'],
        )
        print(missing)

        id_counts = Counter(tag['id'] for tag in with_id)
        duplicates = sorted(
            value for value, seen in id_counts.items() if seen > 1
        )
        # IDs come from the analysed file, so escape them before embedding.
        rows = ''.join(
            ' <tr> <td>' + escape(value) + '</td> </tr>'
            for value in duplicates
        )
        table = '<table>' + rows + '</table>'

        present_html = present.to_html().replace(_PANDAS_TABLE,
                                                 _BOOTSTRAP_TABLE)
        missing_html = missing.to_html().replace(_PANDAS_TABLE,
                                                 _BOOTSTRAP_TABLE)

        return (
            ' <h2>' + escape(html_path) + '</h2>'
            ' <h3>Tags with ID present</h3> ' + present_html +
            ' <h3>Tags with ID not present</h3> ' + missing_html +
            ' <h3>Possible Duplicates</h3> ' + table
        )


if __name__ == '__main__':
    instance = Parse()