"""
Farhan Javed - 11-20-2019
Basic web-scraping tutorial with an example for working with JSON.
Also includes examples for working with HTML and XML files.
"""


import urllib.request as request
import json
from html.parser import HTMLParser
import xml.dom.minidom
# Python json module: https://docs.python.org/3/library/json.html


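# Parse the GeoJSON payload and print the feed title, the event count, every
# event location, the events with magnitude >= 4.0, and the events people
# reported feeling.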
def printResults(data):
    theJSON = json.loads(data)

    if "title" in theJSON["metadata"]:
        print(theJSON["metadata"]["title"])
    count = theJSON["metadata"]["count"]
    print(str(count) + " events recorded")

    # Print where each event occurred.
    for i in theJSON["features"]:
        print(i["properties"]["place"])
    print("-----------------\n")

    # Print only the events with a magnitude of 4.0 or greater.
    for i in theJSON["features"]:
        if i["properties"]["mag"] >= 4.0:
            print("%2.1f" % i["properties"]["mag"], i["properties"]["place"])
    print("-----------------\n")

    # Print the events that at least one person reported feeling.
    print("Events that were felt:")
    for i in theJSON["features"]:
        feltReports = i["properties"]["felt"]
        if feltReports is not None:
            print("%2.1f" % i["properties"]["mag"], i["properties"]["place"],
                  "reported " + str(feltReports) + " times")


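# Fetch a web page with urllib.request and print the HTTP status code and the
# raw response bytes.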
def fetchHTML():
    web_url = request.urlopen("http://www.google.com")
    print("result code: " + str(web_url.getcode()))
    data = web_url.read()
    print(data)


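# Download the USGS "2.5_day" earthquake summary feed and, on a 200 response,
# hand the GeoJSON payload to printResults().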
def workingWithJSON():
    urlData = "http://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/2.5_day.geojson"
    urlLink = request.urlopen(urlData)
    statusCode = urlLink.getcode()

    if statusCode == 200:
        data = urlLink.read()
        printResults(data)
    else:
        print("Received error code: " + str(statusCode))


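# Module-level counter for <meta> tags seen by the HTML parser below.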
metacount = 0


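# Subclass HTMLParser and override its handler methods to report comments,
# start/end tags, attributes, and text data as a document is parsed.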
class MyHTMLParser(HTMLParser):

    def handle_comment(self, data):
        print("Encountered comment:", data)
        pos = self.getpos()
        print("\tAt line:", pos[0], "position", pos[1])

    def handle_starttag(self, tag, attrs):
        global metacount
        if tag == "meta":
            metacount += 1
        print("Encountered tag:", tag)
        pos = self.getpos()
        print("\tAt line:", pos[0], "position", pos[1])

        if len(attrs) > 0:
            print("\tAttributes:")
            for a in attrs:
                print("\t", a[0], "=", a[1])

    def handle_endtag(self, tag):
        print("Encountered tag:", tag)
        pos = self.getpos()
        print("\tAt line:", pos[0], "position", pos[1])

    def handle_data(self, data):
        if data.isspace():
            return
        print("Encountered some data:", data)
        pos = self.getpos()
        print("\tAt line:", pos[0], "position", pos[1])


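# Run a local HTML file through MyHTMLParser and report how many <meta> tags
# were encountered.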
def htmlParsing():
    parser = MyHTMLParser()
    # Read the local sample file and feed its contents to the parser.
    with open("samplehtml.html") as file:
        contents = file.read()
        parser.feed(contents)

    print("Total meta tags found: " + str(metacount))


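# Parse a local XML file with xml.dom.minidom, list its "skill" elements,
# append a new skill node, and list them again.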
def xmlParsing():
    doc = xml.dom.minidom.parse("samplexml.xml")
    print(doc.nodeName)
    print(doc.firstChild.tagName)

    skills = doc.getElementsByTagName("skill")
    print("%d skills:" % skills.length)
    for skill in skills:
        print(skill.getAttribute("name"))

    newSkill = doc.createElement("skill")
    newSkill.setAttribute("name", "jQuery")
    doc.firstChild.appendChild(newSkill)

    skills = doc.getElementsByTagName("skill")
    print("%d skills:" % skills.length)
    for skill in skills:
        print(skill.getAttribute("name"))


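# Run each example in turn.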
def main():
    fetchHTML()
    workingWithJSON()
    htmlParsing()
    xmlParsing()


if __name__ == "__main__":
    main()