{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### Machine Learning for Engineers: [GatherData](https://www.apmonitor.com/pds/index.php/Main/GatherData)\n", "\n", "- [Data Gathering and Consolidation](https://www.apmonitor.com/pds/index.php/Main/GatherData)\n", " - Source Blocks: 23\n", " - Description: Import and Export data in Python for consolidating disparate data (Excel spreadsheet, CSV file, PDF report, database, cloud storage) into a single repository.\n", "- [Course Overview](https://apmonitor.com/pds)\n", "- [Course Schedule](https://apmonitor.com/pds/index.php/Main/CourseSchedule)\n", "\n", "" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "tx = np.linspace(0,1,8); x = np.cos(tx)\n", "dx = pd.DataFrame({'Time':tx,'x':x})\n", "dx.to_csv('dx.csv',index=False)\n", "print(dx)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# local source\n", "pd.read_csv('dx.csv')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# online source\n", "pd.read_csv('http://apmonitor.com/pds/uploads/Main/dx.txt')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "url = 'http://apmonitor.com/pds/index.php/Main/GatherData'\n", "table = pd.read_html(url)\n", "print(table)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# write DataFrame dx\n", "dx.to_clipboard()\n", "dx.to_csv()\n", "dx.to_dict()\n", "dx.to_excel()\n", "dx.to_feather()\n", "dx.to_gbq()\n", "dx.to_hdf()\n", "dx.to_html()\n", "dx.to_json()\n", "dx.to_latex()\n", "dx.to_markdown()\n", "dx.to_parquet()\n", "dx.to_pickle()\n", "dx.to_records()\n", "dx.to_sql()\n", "dx.to_stata()\n", "dx.to_string()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "d = np.genfromtxt('dx.csv',delimiter=',')\n", "print(d)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "d = np.loadtxt('dx.csv',delimiter=',',skiprows=1)\n", "print(d)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "f = open('dx.csv', 'r')\n", "for x in f:\n", " print(x)\n", "f.close()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "f = open('dx.csv', 'r')\n", "x = f.read(4); print(x)\n", "f.close()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "f = open('dx.csv', 'r')\n", "print(f.readline())\n", "print(f.readline())\n", "print(f.readline())\n", "f.close()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import requests\n", "url = 'http://apmonitor.com/pds/index.php/Main/GatherData?action=print'\n", "page = requests.get(url)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from bs4 import BeautifulSoup\n", "soup = BeautifulSoup(page.content, 'html.parser')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(soup.title.text)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for link in soup.find_all('a'):\n", " print('Link Text: {}'.format(link.text))\n", " print('href: {}'.format(link.get('href')))" ] }, { "cell_type": "code", "execution_count": null, 
"metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "\n", "tx = np.linspace(0,1,4); x = np.cos(tx)\n", "dx = pd.DataFrame({'Time':tx,'x':x})\n", "\n", "ty = np.linspace(0,1,3); y = np.sin(ty)\n", "dy = pd.DataFrame({'Time':ty,'y':y})" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dx = dx.set_index('Time')\n", "dy = dy.set_index('Time')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "z = dx.join(dy)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "z = dx.join(dy,how='right')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "z = dx.join(dy,how='inner')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "z = dx.join(dy,how='outer',sort=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "tx = np.linspace(0,1,4); x = np.cos(tx)\n", "dx = pd.DataFrame({'Time':tx,'x':x})\n", "tx = np.linspace(0,1,3)\n", "x = np.cos(tx)\n", "dy = pd.DataFrame({'Time':tx,'x':x})" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dx.append(dy)\\\n", " .sort_values(by='Time')\\\n", " .drop_duplicates(subset='Time')\\\n", " .reset_index(drop=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", "# stock ticker symbol\n", "url = 'http://apmonitor.com/pds/uploads/Main/goog.txt'\n", "\n", "# import data with pandas\n", "data = pd.read_csv(url)\n", "print(data.describe())\n", "\n", "# calculate change and volatility\n", "data['Change'] = data['Close']-data['Open']\n", "data['Volatility'] = data['High']-data['Low']\n", "analysis = ['Open','Volume','Volatility','Change']\n", "sns.heatmap(data[analysis].corr())\n", "plt.show()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 2 }