{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# 012_importing_datasets\n", "\n", "[Source](https://github.com/iArunava/Python-TheNoTheoryGuide/)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Required Imports\n", "import pandas as pd\n", "import sklearn as sk\n", "import sqlite3\n", "from pandas.io import sql" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Importing CSV files from local directory\n", "# NOTE: Make sure the Path you use contains the dataset named 'whereisthatdataset.csv'\n", "\n", "df1 = pd.read_csv ('./assets/whereisthatdataset.csv') # Using relative path\n", "df2 = pd.read_csv ('/home/arunava/Datasets/whereisthatdataset.csv') # Using absolute path\n", "\n", "df1.head(3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# If a dataset comes without headers then you need to pass `headers=None`\n", "# Note: This Dataset comes with headers, \n", "# specifying `headers=None` leads python to treat the first row as part of the dataset\n", "\n", "df1 = pd.read_csv ('./assets/whereisthatdataset.csv', header=None)\n", "df1.head(3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Specify header names while importing datasets with (or without) headers\n", "df1 = pd.read_csv ('./assets/whereisthatdataset.csv', header=None, names=['Where', 'on', 'earth', 'did', 'you', 'got', 'this', 'dataset', 'of', 'Pigeons', 'racing'])\n", "df1.head(3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Importing file from URL\n", "df1 = pd.read_csv('https://raw.githubusercontent.com/iArunava/Python-TheNoTheoryGuide/master/assets/whereisthatdataset.csv')\n", "df1.head(3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Reading Data from text file\n", "# 
NOTE: Use `sep` to specify how your data is separated\n", "\n", "df1 = pd.read_table ('./assets/whereisthatdataset.txt', sep=',')\n", "df2 = pd.read_csv ('./assets/whereisthatdataset.txt', sep=',')\n", "df1.head(3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Read excel file\n", "# NOTE: you need 'xlrd' module to read .xls files\n", "\n", "df1 = pd.read_excel ('./assets/whereisthatdataset.xls', sheet_name='whereisthatdataset', skiprows=1)\n", "df1.head(3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Read SAS file\n", "df1 = pd.read_sas ('./assets/whereisthatdataset.sas7bdat')\n", "df1.head(3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Read SQL Table\n", "\n", "conn = sqlite3.connect ('./assets/whereisthatdataset.db')\n", "query = 'SELECT * FROM whereisthattable;'\n", "df1 = pd.read_sql(query, con=conn)\n", "conn.close()\n", "df1.head(3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Read sample rows and columns\n", "# nrows: Number of rows to select\n", "# usecols: list of cols to use (either all string or unicode)\n", "\n", "sdf1 = pd.read_csv ('./assets/whereisthatdataset.csv', nrows=4, usecols=[1, 5, 7])\n", "sdf2 = pd.read_csv ('./assets/whereisthatdataset.csv', nrows=4, usecols=['Breeder', 'Sex', 'Arrival'])\n", "sdf1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Skip rows while importing\n", "# NOTE: If you don't set header=None, pandas will treat the first remaining row as the header row\n", "\n", "df1 = pd.read_csv ('./assets/whereisthatdataset.csv', header=None, skiprows=5)\n", "df1.head(3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Specify Missing Values\n", "# na_values: pass a list, which if present in the dataset will be 
considered as missing values\n", "\n", "df1 = pd.read_csv ('./assets/whereisthatdataset.csv', na_values=['NaN'])\n", "df1.head(3)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.3" } }, "nbformat": 4, "nbformat_minor": 2 }