Parsing HTML using BeautifulSoup
April 6, 2015 Leave a comment
This Python code, which parses HTML, seems to truncate the tags when I print them. I am attempting to check for the presence of the ID attribute in the tags. The code just iterates over all tags; it does not specifically look for an HTML control. It just matches the opening and closing tags arbitrarily. I am still working on it and will update it.
D:\Python Data Analytics\view.html
   No  Tag
0   1  <div class="panel-collapse collapse" id="activ...
1   1  <select class="selectpicker " id="condition1...
2   1  <select class="selectpicker " id="condition2...
3   1  <select class="selectpicker " id="condition3...
4   1  <select class="selectpicker " id="condition4...
5   1  <select class="selectpicker " id="condition5...
6   1  <input class="btn xbtn-primary save" id="ApS...
7   1  <input class="btn btn-link" id="Cancel" name...
from bs4 import BeautifulSoup as bs
import sys
import os
import pandas as pd
import fnmatch
class Parse:
    """Audit ``id`` attribute usage in the HTML files under a directory.

    For every ``*.html`` file found by walking the scan directory, a report
    document is written that lists the tags carrying an ``id``, the tags
    without one, and the ``id`` values that occur more than once.
    Constructing an instance runs the whole job immediately.
    """

    # Historical hard-coded locations, kept as the defaults.
    DEFAULT_PATH = "D:\\Python Data Analytics\\"
    DEFAULT_REPORT = "D:\\python\\report.html"

    def __init__(self, path=None, report_path=None):
        """Run :meth:`parse` right away.

        path        -- directory tree to scan (defaults to DEFAULT_PATH).
        report_path -- file the report is written to (defaults to
                       DEFAULT_REPORT).
        """
        self.parse(path, report_path)

    def parse(self, path=None, report_path=None):
        """Scan ``path`` for HTML files and write the report to ``report_path``.

        Errors are not propagated: I/O errors and any other exception are
        printed to stdout and end the run.  The report file is created (empty)
        even when no HTML file is found.

        Side effect: sets the global pandas option ``display.max_colwidth`` so
        that long tag markup is not cut off with "..." in the output.
        """
        if path is None:
            path = self.DEFAULT_PATH
        if report_path is None:
            report_path = self.DEFAULT_REPORT

        # ``None`` is the documented "unlimited" value; releases before
        # pandas 1.0 reject it and use -1 for the same purpose.
        try:
            pd.set_option('display.max_colwidth', None)
        except ValueError:
            pd.set_option('display.max_colwidth', -1)

        # Patterns to be matched
        includes = ['*.html']
        try:
            # ``with`` closes the report even when a file fails to parse.
            with open(report_path, 'w') as report:
                for root, _subfolders, files in os.walk(path):
                    for pattern in includes:
                        for infile in fnmatch.filter(files, pattern):
                            # Join with ``root`` (not ``path``) so files found
                            # in sub-directories can actually be opened.
                            source = os.path.join(root, infile)
                            with open(source, 'r') as handle:
                                soup = bs(handle.read())
                            report.write(self._render(infile, soup))
        except IOError as e:
            print("I/O error({0}): {1}".format(e.errno, e.strerror))
        except Exception as err:
            print("Unexpected error: {0}: {1}".format(
                type(err).__name__, err))

    @staticmethod
    def _duplicates(values):
        """Return the values seen more than once, in first-repeat order."""
        seen = set()
        repeated = []
        for value in values:
            if value not in seen:
                seen.add(value)
            elif value not in repeated:
                repeated.append(value)
        return repeated

    def _render(self, infile, soup):
        """Return the HTML report document for one parsed file.

        infile -- file name shown in the report heading.
        soup   -- the parsed document produced for that file.
        """
        # Available on both Python 2 and Python 3.
        from xml.sax.saxutils import escape

        # Tags that carry an id.  The markup string is stored rather than the
        # Tag object: Tag objects are iterable, so pandas may otherwise
        # format them as a sequence of children instead of as markup.
        tagged = soup.findAll(True, {'id': True})
        ids = [tag.attrs['id'] for tag in tagged]
        present = pd.DataFrame(
            {'ID': ids, 'Tag': [str(tag) for tag in tagged]},
            columns=['ID', 'Tag'])

        # Tags without an id, numbered in document order.
        untagged = soup.findAll(attrs={'id': None})
        missing = pd.DataFrame(
            list(enumerate(str(tag) for tag in untagged)))
        print(missing)

        # Swap the default pandas table header for the Bootstrap classes.
        plain = '<table border="1" class="dataframe">'
        styled = '<table class="table table-striped">'
        present_html = present.to_html().replace(plain, styled)
        missing_html = missing.to_html().replace(plain, styled)

        # Ids come from arbitrary input files, so escape them before
        # embedding them in the report markup.
        rows = ''.join(
            ' <tr> <td>' + escape(value) + '</td> </tr>'
            for value in self._duplicates(ids))
        table = '<table>' + rows + '</table>'

        return '\n'.join([
            '',
            '<html>',
            '<head>',
            '<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.1/css/bootstrap.min.css">',
            '<style>body{ margin:0 100; background:whitesmoke; }</style>',
            '</head>',
            '<body>',
            '<h1>HTML 5 Report</h1>',
            '<h2>' + escape(infile) + '</h2>',
            '<h3>Tags with ID present</h3>',
            present_html,
            '<h3>Tags with ID not present</h3>',
            missing_html,
            '<h3>Possible Duplicates</h3>',
            table,
            '</body>',
            '</html>',
        ])
if __name__ == '__main__':
    # Constructing Parse is enough: its __init__ calls parse() itself.
    instance = Parse()