Content-Length: 587366 | pFad | http://github.com/farhanjaved47/python/commit/584597365bcb8620c7201d767eba372bb6788759

05 WebScraping Tutorial · farhanjaved47/python@5845973 · GitHub
Skip to content

Commit 5845973

Browse files
committed
WebScraping Tutorial
1 parent acf0dbf commit 5845973

File tree

3 files changed

+180
-0
lines changed

3 files changed

+180
-0
lines changed

WebScraping.py

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
"""
2+
Farhan Javed - 11-20-2019
3+
Basic Web scraping tutorial with example for working with JSON.
4+
Also includes examples for working with HTML and XML files.
5+
"""
6+
7+
8+
import urllib.request as request
9+
import json
10+
from html.parser import HTMLParser
11+
import xml.dom.minidom
12+
# Python JSON class : https://docs.python.org/3/library/json.html
13+
14+
15+
def _formatMagnitude(mag):
    """Return *mag* formatted to one decimal place, or "n/a" when it is None."""
    return "n/a" if mag is None else "%2.1f" % mag


def printResults(data):
    """Parse a GeoJSON earthquake feed and print a summary of it.

    Prints, in order: the feed title (when present) and the event count,
    the place of every event, the events with magnitude 4.0 or greater,
    and the events that have a "felt" report count.

    Args:
        data: JSON text (str or bytes) with a "metadata" object holding
            "count" (and optionally "title") and a "features" list whose
            items each carry a "properties" object with "place", "mag"
            and "felt" keys.

    Raises:
        json.JSONDecodeError: if *data* is not valid JSON.
        KeyError: if one of the keys listed above is missing.
    """
    theJSON = json.loads(data)
    metadata = theJSON["metadata"]
    # Only the per-event properties are ever used, so pull them out once.
    events = [feature["properties"] for feature in theJSON["features"]]

    if "title" in metadata:
        print(metadata["title"])
    print(str(metadata["count"]) + " events recorded")

    for event in events:
        print(event["place"])
    print("-----------------\n")

    for event in events:
        mag = event["mag"]
        # A JSON null decodes to None, which can neither be compared with a
        # float nor %-formatted; skip such events instead of crashing.
        if mag is not None and mag >= 4.0:
            print("%2.1f" % mag, event["place"])
    print("-----------------\n")

    print("Events that were felt: ")
    for event in events:
        feltReports = event["felt"]
        if feltReports is not None:
            print(_formatMagnitude(event["mag"]), event["place"],
                  " reported " + str(feltReports) + " times")
38+
39+
40+
def fetchHTML():
    """Fetch http://www.google.com and print the status code and raw body.

    The body is printed exactly as returned by ``read()``, i.e. as a
    ``bytes`` object; no decoding is attempted.
    """
    # Using the response as a context manager closes the connection even if
    # reading the body raises.
    with request.urlopen("http://www.google.com") as web_url:
        print("result code : " + str(web_url.getcode()))
        data = web_url.read()
    print(data)
46+
47+
48+
def workingWithJSON():
    """Download the USGS 2.5+ magnitude daily GeoJSON feed and print it.

    On a 200 response the body is handed to ``printResults``; for any other
    status code an error line containing that code is printed instead.
    """
    urlData = "http://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/2.5_day.geojson"

    # The context manager guarantees the response is closed on every path.
    with request.urlopen(urlData) as urlLink:
        statusCode = urlLink.getcode()
        if statusCode != 200:
            # getcode() returns an int, so it must be converted before
            # being concatenated onto the message.
            print("Received error code : " + str(statusCode))
            return
        data = urlLink.read()

    printResults(data)
59+
60+
# Module-level tally of <meta> start tags seen by every MyHTMLParser instance.
metacount = 0


class MyHTMLParser(HTMLParser):
    """HTML parser that prints every comment, tag and text node it meets.

    Each event is followed by the line/offset reported by ``getpos()``.
    Start tags named "meta" additionally increment the module-level
    ``metacount`` counter.
    """

    def _printPosition(self):
        """Print the parser's current (line, offset) position."""
        pos = self.getpos()
        print("\tAt line: ", pos[0], " position ", pos[1])

    def handle_comment(self, data):
        """Report an HTML comment and where it was found."""
        print("Encountered comment: ", data)
        self._printPosition()

    def handle_starttag(self, tag, attrs):
        """Report a start tag, its position and its attributes (if any).

        Args:
            tag: the tag name.
            attrs: list of (name, value) pairs for the tag's attributes.
        """
        global metacount
        if tag == "meta":
            metacount += 1
        print("Encountered tag: ", tag)
        self._printPosition()

        # An empty attribute list is falsy, so no explicit length check.
        if attrs:
            print("\tAttributes: ")
            for name, value in attrs:
                print("\t", name, "=", value)

    def handle_endtag(self, tag):
        """Report an end tag and where it was found."""
        print("Encountered tag: ", tag)
        self._printPosition()

    def handle_data(self, data):
        """Report a text node, ignoring nodes made only of whitespace."""
        if data.isspace():
            return
        print("Encountered some data: ", data)
        self._printPosition()
93+
94+
95+
def htmlParsing():
    """Parse ``samplehtml.html`` with MyHTMLParser and print the meta-tag total.

    The parser prints every event as it goes; afterwards the number of
    <meta> start tags found in this file is printed.

    Raises:
        OSError: if ``samplehtml.html`` cannot be opened.
    """
    global metacount
    # Start from zero so that a repeated call reports this file's total
    # rather than a running sum across calls.
    metacount = 0

    parser = MyHTMLParser()
    # The with-block closes the file even if reading fails. The sample
    # document declares charset utf-8, so decode it as such instead of
    # relying on the platform default.
    with open("samplehtml.html", encoding="utf-8") as html_file:
        contents = html_file.read()
    parser.feed(contents)

    print("Total meta tags found " + str(metacount))
104+
105+
106+
def xmlParsing():
    """Load ``samplexml.xml``, list its <skill> elements, add one, list again.

    Prints the document node name and the root element's tag name, then the
    number and names of all <skill> elements. A new <skill name="jQuery">
    element is appended to the root (in memory only; the file is not
    rewritten) and the listing is printed a second time.

    Raises:
        OSError: if ``samplexml.xml`` cannot be opened.
        xml.parsers.expat.ExpatError: if the file is not well-formed XML.
    """
    doc = xml.dom.minidom.parse("samplexml.xml")
    # documentElement is always the root element, whereas firstChild may be
    # a comment or other node that precedes the root and has no tagName.
    root = doc.documentElement

    def printSkills():
        """Print the count and the name attribute of every <skill>."""
        skills = doc.getElementsByTagName("skill")
        print("%d skills : " % skills.length)
        for skill in skills:
            print(skill.getAttribute("name"))

    print(doc.nodeName)
    print(root.tagName)

    printSkills()

    newSkill = doc.createElement("skill")
    newSkill.setAttribute("name", "jQuery")
    root.appendChild(newSkill)

    printSkills()
124+
125+
126+
def main():
    """Run every demo in order: HTML fetch, JSON feed, HTML parse, XML parse."""
    for demo in (fetchHTML, workingWithJSON, htmlParsing, xmlParsing):
        demo()


# Only run the demos when executed as a script, not when imported.
if __name__ == "__main__":
    main()

samplehtml.html

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
<!DOCTYPE html>
2+
<html lang="en">
3+
<head>
4+
<meta charset="utf-8" />
5+
<title>Sample HTML Document</title>
6+
<meta name="description" content="This is a sample HTML file" />
7+
<meta name="author" content="Administrator" />
8+
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
9+
<!-- Replace favicon.ico & apple-touch-icon.png in the root of your domain and delete these references -->
10+
<link rel="shortcut icon" href="/favicon.ico" />
11+
<link rel="apple-touch-icon" href="/apple-touch-icon.png" />
12+
</head>
13+
14+
<body>
15+
<div>
16+
<header>
17+
<h1>HTML Sample File</h1>
18+
</header>
19+
<nav>
20+
<p>
21+
<a href="/">Home</a>
22+
</p>
23+
<p>
24+
<a href="/contact">Contact</a>
25+
</p>
26+
</nav>
27+
<div>
28+
29+
</div>
30+
<footer>
31+
<p>&copy; Copyright by Administrator</p>
32+
</footer>
33+
</div>
34+
</body>
35+
</html>

samplexml.xml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
<?xml version="1.0" encoding="UTF-8" ?>
2+
<person>
3+
<firstname>Joe</firstname>
4+
<lastname>Marini</lastname>
5+
<home>Seattle</home>
6+
<skill name="JavaScript"/>
7+
<skill name="Python"/>
8+
<skill name="C#"/>
9+
<skill name="HTML"/>
10+
</person>

0 commit comments

Comments
 (0)








ApplySandwichStrip

pFad - (p)hone/(F)rame/(a)nonymizer/(d)eclutterfier!      Saves Data!


--- a PPN by Garber Painting Akron. With Image Size Reduction included!

Fetched URL: http://github.com/farhanjaved47/python/commit/584597365bcb8620c7201d767eba372bb6788759

Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy