Parsing HTML using BeautifulSoup

April 6, 2015 Leave a comment

This Python code parses HTML, but the tags appear truncated when I print them. I am attempting to check for the presence of the ID attribute in the tags. The code just iterates over all tags; it does not look for any specific HTML control. It simply matches opening and closing tags arbitrarily. I am still working on it and will update it.

D:\Python Data Analytics\view.html
   No                                                Tag
0   1  <div class="panel-collapse collapse" id="activ...
1   1  <select class="selectpicker " id="condition1...
2   1  <select class="selectpicker " id="condition2...
3   1  <select class="selectpicker " id="condition3...
4   1  <select class="selectpicker " id="condition4...
5   1  <select class="selectpicker " id="condition5...
6   1  <input class="btn xbtn-primary save" id="ApS...
7   1  <input class="btn btn-link" id="Cancel" name...

from bs4 import BeautifulSoup as bs
import sys
import os
import pandas as pd
import fnmatch

class Parse:
    """Scan a folder tree for ``*.html`` files and write an ID-attribute report.

    For every matching file the report lists the tags that carry an ``id``
    attribute, the tags that do not, and the ``id`` values that occur more
    than once.  All per-file reports are written, one after another, into a
    single output file.

    Constructing an instance runs the scan immediately (``__init__`` calls
    ``parse``).
    """

    # Defaults reproduce the locations the script was originally hard-wired to.
    DEFAULT_PATH = "D:\\Python Data Analytics\\"
    DEFAULT_REPORT = "D:\\python\\report.html"

    def __init__(self, path=DEFAULT_PATH, report_path=DEFAULT_REPORT):
        """Run :meth:`parse` with the given (or default) locations."""
        self.parse(path, report_path)

    def parse(self, path=DEFAULT_PATH, report_path=DEFAULT_REPORT):
        """Walk ``path`` and write the combined report to ``report_path``.

        Args:
            path: Root folder searched recursively for ``*.html`` files.
            report_path: File that receives the generated report; it is
                truncated on open.

        Returns:
            None.  I/O errors and any other exception are printed rather
            than propagated, as in the original script.
        """
        # NOTE(review): 0 is presumably meant to stop pandas from truncating
        # long tag strings -- confirm against the installed pandas version.
        pd.options.display.max_colwidth = 0

        # Pattern to be matched
        includes = ['*.html']

        try:
            # "with" guarantees the report is closed even if a file fails.
            with open(report_path, 'w') as report:
                for root, _subfolders, files in os.walk(path):
                    for pattern in includes:
                        for infile in fnmatch.filter(files, pattern):
                            # Join on the directory os.walk is currently in;
                            # joining on the top-level path breaks for files
                            # found in sub-folders.
                            with open(os.path.join(root, infile), "r") as source:
                                # NOTE(review): no parser is named here, so
                                # bs4 chooses one itself -- consider pinning.
                                soup = bs(source.read())

                            tagged = soup.findAll(True, {'id': True})
                            ids = [tag.attrs['id'] for tag in tagged]
                            # Store markup text, not Tag objects: a Tag is
                            # iterable, so pandas may print it as a list of
                            # its children instead of as markup.
                            present = pd.DataFrame(
                                {'ID': ids,
                                 'Tag': [tag.decode() for tag in tagged]},
                                columns=['ID', 'Tag'])

                            untagged = soup.findAll(attrs={'id': None})
                            # Built in one go; DataFrame.append was removed
                            # in pandas 2.0 and was quadratic in a loop.
                            missing = pd.DataFrame(
                                list(enumerate(tag.decode()
                                               for tag in untagged)))
                            print(missing)

                            report.write(self._render_page(
                                infile,
                                self._bootstrap_table(present),
                                self._bootstrap_table(missing),
                                self._duplicates_table(
                                    self._duplicate_ids(ids))))
        except IOError as e:
            print("I/O error({0}): {1}".format(e.errno, e.strerror))
        except Exception:
            print("Unexpected error: {0}".format(sys.exc_info()[0]))

    @staticmethod
    def _duplicate_ids(ids):
        """Return the sorted list of values that occur more than once in ``ids``."""
        seen = set()
        repeated = set()
        for value in ids:
            if value in seen:
                repeated.add(value)
            else:
                seen.add(value)
        # Sorted so the report is identical from run to run.
        return sorted(repeated)

    @staticmethod
    def _duplicates_table(duplicates):
        """Render ``duplicates`` as a one-column HTML table, one row per value."""
        rows = ['  <tr>' + '    <td>' + element + '</td>' + '  </tr>'
                for element in duplicates]
        return '<table>' + ''.join(rows) + '</table>'

    @staticmethod
    def _bootstrap_table(frame):
        """Render ``frame`` via ``to_html`` and swap in the Bootstrap table classes."""
        return frame.to_html().replace(
            '<table border="1" class="dataframe">',
            '<table class="table table-striped">')

    @staticmethod
    def _render_page(infile, present_html, missing_html, duplicates_html):
        """Assemble the HTML page for one input file from its three tables."""
        return '''
            							<html>
                							<head>
                    						<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.1/css/bootstrap.min.css">
                    						<style>body{ margin:0 100; background:whitesmoke; }</style>
                							</head>
                                            <body>
                                            <h1>HTML 5 Report</h1>
                                            <h2>''' + infile + '''</h2>
            
                                            <h3>Tags with ID present</h3>
                   								''' + present_html + '''
                                            <h3>Tags with ID not present</h3>
                                                ''' + missing_html + '''
                                            <h3>Possible Duplicates</h3>
                                                ''' + duplicates_html + '''
                							</body>
            				</html>'''
 

# Script entry point: building a Parse object is what triggers the scan,
# because Parse.__init__ calls parse() itself.
if __name__ == "__main__":
    instance = Parse()

Filed under Python

M	T	W	T	F	S	S
		1	2	3	4	5
6	7	8	9	10	11	12
13	14	15	16	17	18	19
20	21	22	23	24	25	26
27	28	29	30

MindSpace

Parsing HTML using BeautifulSoup

Leave a comment Cancel reply

Blogroll