"""
Farhan Javed - 11-20-2019
Basic web-scraping tutorial with an example for working with JSON.
Also includes examples for working with HTML and XML files.
"""


import urllib.request as request
import json
from html.parser import HTMLParser
import xml.dom.minidom
# Python json module: https://docs.python.org/3/library/json.html


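# Parse the GeoJSON payload and print the feed title, the event count, every
# event location, the events with magnitude >= 4.0, and the events people
# reported feeling.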
def printResults(data):
    theJSON = json.loads(data)

    if "title" in theJSON["metadata"]:
        print(theJSON["metadata"]["title"])
    count = theJSON["metadata"]["count"]
    print(str(count) + " events recorded")

    # Print where each event occurred.
    for i in theJSON["features"]:
        print(i["properties"]["place"])
    print("-----------------\n")

    # Print only the events with a magnitude of 4.0 or greater.
    for i in theJSON["features"]:
        if i["properties"]["mag"] >= 4.0:
            print("%2.1f" % i["properties"]["mag"], i["properties"]["place"])
    print("-----------------\n")

    # Print the events that at least one person reported feeling.
    print("Events that were felt:")
    for i in theJSON["features"]:
        feltReports = i["properties"]["felt"]
        if feltReports is not None:
            print("%2.1f" % i["properties"]["mag"], i["properties"]["place"],
                  "reported " + str(feltReports) + " times")


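# Fetch a web page with urllib.request and print the HTTP status code and the
# raw response bytes.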
def fetchHTML():
    web_url = request.urlopen("http://www.google.com")
    print("result code: " + str(web_url.getcode()))
    data = web_url.read()
    print(data)


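# Download the USGS "2.5_day" earthquake summary feed and, on a 200 response,
# hand the GeoJSON payload to printResults().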
def workingWithJSON():
    urlData = "http://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/2.5_day.geojson"
    urlLink = request.urlopen(urlData)
    statusCode = urlLink.getcode()

    if statusCode == 200:
        data = urlLink.read()
        printResults(data)
    else:
        print("Received error code: " + str(statusCode))


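# Module-level counter for <meta> tags seen by the HTML parser below.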
metacount = 0


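# Subclass HTMLParser and override its handler methods to report comments,
# start/end tags, attributes, and text data as a document is parsed.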
class MyHTMLParser(HTMLParser):

    def handle_comment(self, data):
        print("Encountered comment:", data)
        pos = self.getpos()
        print("\tAt line:", pos[0], "position", pos[1])

    def handle_starttag(self, tag, attrs):
        global metacount
        if tag == "meta":
            metacount += 1
        print("Encountered tag:", tag)
        pos = self.getpos()
        print("\tAt line:", pos[0], "position", pos[1])

        if len(attrs) > 0:
            print("\tAttributes:")
            for a in attrs:
                print("\t", a[0], "=", a[1])

    def handle_endtag(self, tag):
        print("Encountered tag:", tag)
        pos = self.getpos()
        print("\tAt line:", pos[0], "position", pos[1])

    def handle_data(self, data):
        if data.isspace():
            return
        print("Encountered some data:", data)
        pos = self.getpos()
        print("\tAt line:", pos[0], "position", pos[1])


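# Run a local HTML file through MyHTMLParser and report how many <meta> tags
# were encountered.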
def htmlParsing():
    parser = MyHTMLParser()
    # Read the local sample file and feed its contents to the parser.
    with open("samplehtml.html") as file:
        contents = file.read()
        parser.feed(contents)

    print("Total meta tags found: " + str(metacount))


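# Parse a local XML file with xml.dom.minidom, list its "skill" elements,
# append a new skill node, and list them again.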
def xmlParsing():
    doc = xml.dom.minidom.parse("samplexml.xml")
    print(doc.nodeName)
    print(doc.firstChild.tagName)

    skills = doc.getElementsByTagName("skill")
    print("%d skills:" % skills.length)
    for skill in skills:
        print(skill.getAttribute("name"))

    newSkill = doc.createElement("skill")
    newSkill.setAttribute("name", "jQuery")
    doc.firstChild.appendChild(newSkill)

    skills = doc.getElementsByTagName("skill")
    print("%d skills:" % skills.length)
    for skill in skills:
        print(skill.getAttribute("name"))


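# Run each example in turn.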
def main():
    fetchHTML()
    workingWithJSON()
    htmlParsing()
    xmlParsing()


if __name__ == "__main__":
    main()