{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Advanced Topics on Files"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Working with HTML files\n",
    "- fetch an HTML page from the web\n",
    "- parse the HTML file with the BeautifulSoup library"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# fetch an HTML page and save it to a local file\n",
    "import urllib.request\n",
    "\n",
    "url = 'https://rambasnet.github.io/teaching.html'\n",
    "localfile = 'teaching.html'\n",
    "urllib.request.urlretrieve(url, localfile)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# split() with no argument splits on any run of whitespace (spaces, tabs,\n",
    "# newlines) and drops empty tokens; data.split(' ') would miscount because\n",
    "# consecutive spaces produce empty strings and newlines are not split at all\n",
    "with open(localfile) as f:\n",
    "    data = f.read()\n",
    "words = data.split()\n",
    "print(f'There are {len(words)} words in the file.')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Parsing HTML using the BeautifulSoup library\n",
    "- install the BeautifulSoup library:\n",
    "    `%pip install bs4`\n",
    "- https://www.crummy.com/software/BeautifulSoup/bs4/doc/#\n",
    "- an alternative is the nltk (Natural Language Toolkit) library\n",
    "- http://www.nltk.org/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# %pip installs into the running kernel's environment;\n",
    "# !pip may target a different Python interpreter\n",
    "%pip install bs4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from bs4 import BeautifulSoup\n",
    "\n",
    "localfile = 'teaching.html'\n",
    "with open(localfile) as f:\n",
    "    # 'html.parser' ships with the standard library; 'lxml' would require a\n",
    "    # separate third-party install that this notebook never performs\n",
    "    soup = BeautifulSoup(f.read(), 'html.parser')\n",
    "text = soup.get_text()\n",
    "print(text[:500])  # preview only -- the full page dump is very long"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# break into lines and remove leading and trailing whitespace on each line\n",
    "lines = [line.strip() for line in text.splitlines()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(lines[:20])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create a list of words by splitting multi-word lines;\n",
    "# split() never returns whitespace-padded tokens, so no extra strip() is needed\n",
    "words = [word.lower() for line in lines for word in line.split()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(words[:20])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(f'There are {len(words)} words in the file.')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Find histogram of words\n",
    "- use `defaultdict` found in the collections module\n",
    "- https://docs.python.org/3/library/collections.html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import defaultdict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# defaultdict(int) starts every missing key at 0, so no membership test is needed\n",
    "hist = defaultdict(int)\n",
    "for w in words:\n",
    "    hist[w] += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# turn the histogram into a list of (word, count) pairs so it can be sorted\n",
    "listHist = list(hist.items())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(listHist[:10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# sort by count, most frequent first\n",
    "listHist.sort(key=lambda x: x[1], reverse=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# print the 10 most common words\n",
    "print(listHist[:10])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Working with binary files\n",
    "- the following example copies a binary file such as an image"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fileSrc = './resources/brain.jpg'\n",
    "fileDst = 'brain-copy.jpg'\n",
    "with open(fileSrc, 'rb') as rbf:  # rb - read binary mode\n",
    "    data = rbf.read()  # read the whole binary file\n",
    "with open(fileDst, 'wb') as wbf:  # wb - write binary mode\n",
    "    wbf.write(data)  # write the whole binary file"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Use checksum to compare if two files match exactly!\n",
    "- a checksum makes sure that not a single bit differs between the two files\n",
    "- used in security\n",
    "- import and use hashlib - https://docs.python.org/3/library/hashlib.html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import hashlib\n",
    "\n",
    "# use context managers so both file handles are closed deterministically\n",
    "with open(fileSrc, 'rb') as f1:\n",
    "    file1ChkSum = hashlib.sha256(f1.read()).hexdigest()\n",
    "with open(fileDst, 'rb') as f2:\n",
    "    file2ChkSum = hashlib.sha256(f2.read()).hexdigest()\n",
    "\n",
    "if file1ChkSum == file2ChkSum:\n",
    "    print('two files checksums match!')\n",
    "else:\n",
    "    print('oops! two files checksums do NOT match!')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Python object serialization with the pickle library\n",
    "- https://docs.python.org/3/library/pickle.html\n",
    "- the pickle module implements binary protocols for serializing and de-serializing a Python object\n",
    "- Pickling - serializing a Python object\n",
    "- Un-pickling - de-serializing a Python object (inverse operation)\n",
    "- **NOTE:** never unpickle data received from an untrusted source -- it can execute arbitrary code"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "\n",
    "alist = list(range(2, 21, 2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(alist)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# pickle (serialize) alist into a binary file\n",
    "pickleFile = 'myPickle.pkl'\n",
    "with open(pickleFile, 'wb') as p:\n",
    "    pickle.dump(alist, p)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# unpickle (de-serialize) the list back; safe here only because we\n",
    "# created myPickle.pkl ourselves in the previous cell\n",
    "with open(pickleFile, 'rb') as p:\n",
    "    blist = pickle.load(p)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# round-trip check: the unpickled object compares equal to the original\n",
    "alist == blist"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}