{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#from the:\n", "http://www.listendata.com/2016/08/dplyr-tutorial.html\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# dplyr package was written by the most popular R programmer Hadley Wickham who has written many \n", "# useful R packages such as ggplot2, tidyr etc.\n", "# Two azaming hire for MST would be \n", "1. Hadley Wickham -> R Fame\n", "2. Wes McKinney -> python panda Fame \n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Installing package into 'C:/Users/v-thbeta/Documents/R/win-library/3.3'\n", "(as 'lib' is unspecified)\n", "also installing the dependencies 'assertthat', 'tibble', 'lazyeval', 'DBI', 'BH'\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "package 'assertthat' successfully unpacked and MD5 sums checked\n", "package 'tibble' successfully unpacked and MD5 sums checked\n", "package 'lazyeval' successfully unpacked and MD5 sums checked\n", "package 'DBI' successfully unpacked and MD5 sums checked\n", "package 'BH' successfully unpacked and MD5 sums checked\n", "package 'dplyr' successfully unpacked and MD5 sums checked\n", "\n", "The downloaded binary packages are in\n", "\tC:\\Users\\v-thbeta\\AppData\\Local\\Temp\\RtmpiObDvN\\downloaded_packages\n" ] } ], "source": [ "install.packages(\"dplyr\")" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "\n", "Attaching package: 'dplyr'\n", "\n", "The following objects are masked from 'package:stats':\n", "\n", " filter, lag\n", "\n", "The following objects are masked from 'package:base':\n", "\n", " intersect, setdiff, setequal, union\n", "\n" ] } ], "source": [ "library(dplyr)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [], "source": [ "df <- read.csv(\"c:\\\\Users\\\\v-thbeta\\\\Desktop\\\\examples\\\\data\\\\states_income\\\\sampledata.csv\")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Observations: 51\n", "Variables: 16\n", "$ Index A, A, A, A, C, C, C, D, D, F, G, H, I, I, I, I, K, K, L, M, ...\n", "$ State Alabama, Alaska, Arizona, Arkansas, California, Colorado, Co...\n", "$ Y2002 1296530, 1170302, 1742027, 1485531, 1685349, 1343824, 1610512...\n", "$ Y2003 1317711, 1960378, 1968140, 1994927, 1675807, 1878473, 1232844...\n", "$ Y2004 1118631, 1818085, 1377583, 1119299, 1889570, 1886149, 1181949...\n", "$ Y2005 1492583, 1447852, 1782199, 1947979, 1480280, 1236697, 1518933...\n", "$ Y2006 1107408, 1861639, 1102568, 1669191, 1735069, 1871471, 1841266...\n", "$ Y2007 1440134, 1465841, 1109382, 1801213, 1812546, 1814218, 1976976...\n", "$ Y2008 1945229, 1551826, 1752886, 1188104, 1487315, 1875146, 1764457...\n", "$ Y2009 1944173, 1436541, 1554330, 1628980, 1663809, 1752387, 1972730...\n", "$ Y2010 1237582, 1629616, 1300521, 1669295, 1624509, 1913275, 1968730...\n", "$ Y2011 1440756, 1230866, 1130709, 1928238, 1639670, 1665877, 1945524...\n", "$ Y2012 1186741, 1512804, 1907284, 1216675, 1921845, 1491604, 1228529...\n", "$ Y2013 1852841, 1985302, 1363279, 1591896, 1156536, 1178355, 1582249...\n", "$ Y2014 1558906, 1580394, 1525866, 1360959, 1388461, 1383978, 1503156...\n", "$ Y2015 1916661, 1979143, 1647724, 1329341, 1644607, 1330736, 1718072...\n" ] } ], "source": [ "glimpse(df)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
    \n", "\t
  1. 51
  2. \n", "\t
  3. 16
  4. \n", "
\n" ], "text/latex": [ "\\begin{enumerate*}\n", "\\item 51\n", "\\item 16\n", "\\end{enumerate*}\n" ], "text/markdown": [ "1. 51\n", "2. 16\n", "\n", "\n" ], "text/plain": [ "[1] 51 16" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "dim(df)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\n", "
IndexStateY2002Y2003Y2004Y2005Y2006Y2007Y2008Y2009Y2010Y2011Y2012Y2013Y2014Y2015
6C Colorado 1343824 1878473 1886149 1236697 1871471 1814218 1875146 1752387 1913275 1665877 1491604 1178355 1383978 1330736
32N New Mexico 1819239 1226057 1935991 1124400 1723493 1475985 1237704 1820856 1801430 1653384 1475715 1623388 1533494 1868612
36O Ohio 1802132 1648498 1441386 1670280 1534888 1314824 1516621 1511460 1585465 1887714 1227303 1840898 1880804 1573117
45U Utah 1771096 1195861 1979395 1241662 1437456 1859416 1939284 1915865 1619186 1288285 1108281 1123353 1801019 1729273
41S South Carolina1631522 1803455 1425193 1458191 1538731 1825195 1250499 1864685 1345102 1116203 1532332 1591735 1188417 1110655
\n" ], "text/latex": [ "\\begin{tabular}{r|llllllllllllllll}\n", " & Index & State & Y2002 & Y2003 & Y2004 & Y2005 & Y2006 & Y2007 & Y2008 & Y2009 & Y2010 & Y2011 & Y2012 & Y2013 & Y2014 & Y2015\\\\\n", "\\hline\n", "\t6 & C & Colorado & 1343824 & 1878473 & 1886149 & 1236697 & 1871471 & 1814218 & 1875146 & 1752387 & 1913275 & 1665877 & 1491604 & 1178355 & 1383978 & 1330736 \\\\\n", "\t32 & N & New Mexico & 1819239 & 1226057 & 1935991 & 1124400 & 1723493 & 1475985 & 1237704 & 1820856 & 1801430 & 1653384 & 1475715 & 1623388 & 1533494 & 1868612 \\\\\n", "\t36 & O & Ohio & 1802132 & 1648498 & 1441386 & 1670280 & 1534888 & 1314824 & 1516621 & 1511460 & 1585465 & 1887714 & 1227303 & 1840898 & 1880804 & 1573117 \\\\\n", "\t45 & U & Utah & 1771096 & 1195861 & 1979395 & 1241662 & 1437456 & 1859416 & 1939284 & 1915865 & 1619186 & 1288285 & 1108281 & 1123353 & 1801019 & 1729273 \\\\\n", "\t41 & S & South Carolina & 1631522 & 1803455 & 1425193 & 1458191 & 1538731 & 1825195 & 1250499 & 1864685 & 1345102 & 1116203 & 1532332 & 1591735 & 1188417 & 1110655 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ " Index State Y2002 Y2003 Y2004 Y2005 Y2006 Y2007 Y2008 \n", "6 C Colorado 1343824 1878473 1886149 1236697 1871471 1814218 1875146\n", "32 N New Mexico 1819239 1226057 1935991 1124400 1723493 1475985 1237704\n", "36 O Ohio 1802132 1648498 1441386 1670280 1534888 1314824 1516621\n", "45 U Utah 1771096 1195861 1979395 1241662 1437456 1859416 1939284\n", "41 S South Carolina 1631522 1803455 1425193 1458191 1538731 1825195 1250499\n", " Y2009 Y2010 Y2011 Y2012 Y2013 Y2014 Y2015 \n", "6 1752387 1913275 1665877 1491604 1178355 1383978 1330736\n", "32 1820856 1801430 1653384 1475715 1623388 1533494 1868612\n", "36 1511460 1585465 1887714 1227303 1840898 1880804 1573117\n", "45 1915865 1619186 1288285 1108281 1123353 1801019 1729273\n", "41 1864685 1345102 1116203 1532332 1591735 1188417 1110655" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "sample_n(df,5)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\n", "
IndexStateY2002Y2003Y2004Y2005Y2006Y2007Y2008Y2009Y2010Y2011Y2012Y2013Y2014Y2015
35N North Dakota 1618807 1510193 1876940 1443172 1425030 1868788 1720352 1671468 1534571 1271132 1430978 1529024 1563898 1604118
34N North Carolina1616742 1292223 1482792 1532347 1158716 1827420 1267737 1116168 1791535 1553750 1472258 1104893 1596452 1229085
18K Kentucky 1813878 1448846 1800760 1250524 1137913 1911227 1301848 1956681 1350895 1512894 1916616 1878271 1722762 1913350
28N Nebraska 1885081 1309769 1425527 1240465 1500594 1278272 1140598 1270585 1128711 1187207 1569665 1690920 1459243 1802211
6C Colorado 1343824 1878473 1886149 1236697 1871471 1814218 1875146 1752387 1913275 1665877 1491604 1178355 1383978 1330736
\n" ], "text/latex": [ "\\begin{tabular}{r|llllllllllllllll}\n", " & Index & State & Y2002 & Y2003 & Y2004 & Y2005 & Y2006 & Y2007 & Y2008 & Y2009 & Y2010 & Y2011 & Y2012 & Y2013 & Y2014 & Y2015\\\\\n", "\\hline\n", "\t35 & N & North Dakota & 1618807 & 1510193 & 1876940 & 1443172 & 1425030 & 1868788 & 1720352 & 1671468 & 1534571 & 1271132 & 1430978 & 1529024 & 1563898 & 1604118 \\\\\n", "\t34 & N & North Carolina & 1616742 & 1292223 & 1482792 & 1532347 & 1158716 & 1827420 & 1267737 & 1116168 & 1791535 & 1553750 & 1472258 & 1104893 & 1596452 & 1229085 \\\\\n", "\t18 & K & Kentucky & 1813878 & 1448846 & 1800760 & 1250524 & 1137913 & 1911227 & 1301848 & 1956681 & 1350895 & 1512894 & 1916616 & 1878271 & 1722762 & 1913350 \\\\\n", "\t28 & N & Nebraska & 1885081 & 1309769 & 1425527 & 1240465 & 1500594 & 1278272 & 1140598 & 1270585 & 1128711 & 1187207 & 1569665 & 1690920 & 1459243 & 1802211 \\\\\n", "\t6 & C & Colorado & 1343824 & 1878473 & 1886149 & 1236697 & 1871471 & 1814218 & 1875146 & 1752387 & 1913275 & 1665877 & 1491604 & 1178355 & 1383978 & 1330736 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ " Index State Y2002 Y2003 Y2004 Y2005 Y2006 Y2007 Y2008 \n", "35 N North Dakota 1618807 1510193 1876940 1443172 1425030 1868788 1720352\n", "34 N North Carolina 1616742 1292223 1482792 1532347 1158716 1827420 1267737\n", "18 K Kentucky 1813878 1448846 1800760 1250524 1137913 1911227 1301848\n", "28 N Nebraska 1885081 1309769 1425527 1240465 1500594 1278272 1140598\n", "6 C Colorado 1343824 1878473 1886149 1236697 1871471 1814218 1875146\n", " Y2009 Y2010 Y2011 Y2012 Y2013 Y2014 Y2015 \n", "35 1671468 1534571 1271132 1430978 1529024 1563898 1604118\n", "34 1116168 1791535 1553750 1472258 1104893 1596452 1229085\n", "18 1956681 1350895 1512894 1916616 1878271 1722762 1913350\n", "28 1270585 1128711 1187207 1569665 1690920 1459243 1802211\n", "6 1752387 1913275 1665877 1491604 1178355 1383978 1330736" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "sample_n(df,5)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": true }, "outputs": [], "source": [ "dedup = distinct(df)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "'data.frame':\t51 obs. of 16 variables:\n", " $ Index: Factor w/ 19 levels \"A\",\"C\",\"D\",\"F\",..: 1 1 1 1 2 2 2 3 3 4 ...\n", " $ State: Factor w/ 51 levels \"Alabama\",\"Alaska\",..: 1 2 3 4 5 6 7 8 9 10 ...\n", " $ Y2002: int 1296530 1170302 1742027 1485531 1685349 1343824 1610512 1330403 1111437 1964626 ...\n", " $ Y2003: int 1317711 1960378 1968140 1994927 1675807 1878473 1232844 1268673 1993741 1468852 ...\n", " $ Y2004: int 1118631 1818085 1377583 1119299 1889570 1886149 1181949 1706751 1374643 1419738 ...\n", " $ Y2005: int 1492583 1447852 1782199 1947979 1480280 1236697 1518933 1403759 1827949 1362787 ...\n", " $ Y2006: int 1107408 1861639 1102568 1669191 1735069 1871471 1841266 1441351 1803852 1339608 ...\n", " $ Y2007: int 1440134 1465841 1109382 1801213 1812546 1814218 1976976 1300836 1595981 1278550 ...\n", " $ Y2008: int 1945229 1551826 1752886 1188104 1487315 1875146 1764457 1762096 1193245 1756185 ...\n", " $ Y2009: int 1944173 1436541 1554330 1628980 1663809 1752387 1972730 1553585 1739748 1818438 ...\n", " $ Y2010: int 1237582 1629616 1300521 1669295 1624509 1913275 1968730 1370984 1707823 1198403 ...\n", " $ Y2011: int 1440756 1230866 1130709 1928238 1639670 1665877 1945524 1318669 1353449 1497051 ...\n", " $ Y2012: int 1186741 1512804 1907284 1216675 1921845 1491604 1228529 1984027 1979708 1131928 ...\n", " $ Y2013: int 1852841 1985302 1363279 1591896 1156536 1178355 1582249 1671279 1912654 1107448 ...\n", " $ Y2014: int 1558906 1580394 1525866 1360959 1388461 1383978 1503156 1803169 1782169 1407784 ...\n", " $ Y2015: int 1916661 1979143 1647724 1329341 1644607 1330736 1718072 1627508 1410183 1170389 ...\n" ] } ], "source": [ "str(dedup)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": true }, "outputs": [], "source": [ "dedup2 = distinct(df, Index, .keep_all=TRUE)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "'data.frame':\t51 obs. of 16 variables:\n", " $ Index: Factor w/ 19 levels \"A\",\"C\",\"D\",\"F\",..: 1 1 1 1 2 2 2 3 3 4 ...\n", " $ State: Factor w/ 51 levels \"Alabama\",\"Alaska\",..: 1 2 3 4 5 6 7 8 9 10 ...\n", " $ Y2002: int 1296530 1170302 1742027 1485531 1685349 1343824 1610512 1330403 1111437 1964626 ...\n", " $ Y2003: int 1317711 1960378 1968140 1994927 1675807 1878473 1232844 1268673 1993741 1468852 ...\n", " $ Y2004: int 1118631 1818085 1377583 1119299 1889570 1886149 1181949 1706751 1374643 1419738 ...\n", " $ Y2005: int 1492583 1447852 1782199 1947979 1480280 1236697 1518933 1403759 1827949 1362787 ...\n", " $ Y2006: int 1107408 1861639 1102568 1669191 1735069 1871471 1841266 1441351 1803852 1339608 ...\n", " $ Y2007: int 1440134 1465841 1109382 1801213 1812546 1814218 1976976 1300836 1595981 1278550 ...\n", " $ Y2008: int 1945229 1551826 1752886 1188104 1487315 1875146 1764457 1762096 1193245 1756185 ...\n", " $ Y2009: int 1944173 1436541 1554330 1628980 1663809 1752387 1972730 1553585 1739748 1818438 ...\n", " $ Y2010: int 1237582 1629616 1300521 1669295 1624509 1913275 1968730 1370984 1707823 1198403 ...\n", " $ Y2011: int 1440756 1230866 1130709 1928238 1639670 1665877 1945524 1318669 1353449 1497051 ...\n", " $ Y2012: int 1186741 1512804 1907284 1216675 1921845 1491604 1228529 1984027 1979708 1131928 ...\n", " $ Y2013: int 1852841 1985302 1363279 1591896 1156536 1178355 1582249 1671279 1912654 1107448 ...\n", " $ Y2014: int 1558906 1580394 1525866 1360959 1388461 1383978 1503156 1803169 1782169 1407784 ...\n", " $ Y2015: int 1916661 1979143 1647724 1329341 1644607 1330736 1718072 1627508 1410183 1170389 ...\n" ] } ], "source": [ "str(dedup)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [], "source": [ "df2 = select(df, Index, State:Y2008, Y2015)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Expecting to see column Y2015 as part of the results-set" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "'data.frame':\t51 obs. of 9 variables:\n", " $ Index: Factor w/ 19 levels \"A\",\"C\",\"D\",\"F\",..: 1 1 1 1 2 2 2 3 3 4 ...\n", " $ State: Factor w/ 51 levels \"Alabama\",\"Alaska\",..: 1 2 3 4 5 6 7 8 9 10 ...\n", " $ Y2002: int 1296530 1170302 1742027 1485531 1685349 1343824 1610512 1330403 1111437 1964626 ...\n", " $ Y2003: int 1317711 1960378 1968140 1994927 1675807 1878473 1232844 1268673 1993741 1468852 ...\n", " $ Y2004: int 1118631 1818085 1377583 1119299 1889570 1886149 1181949 1706751 1374643 1419738 ...\n", " $ Y2005: int 1492583 1447852 1782199 1947979 1480280 1236697 1518933 1403759 1827949 1362787 ...\n", " $ Y2006: int 1107408 1861639 1102568 1669191 1735069 1871471 1841266 1441351 1803852 1339608 ...\n", " $ Y2007: int 1440134 1465841 1109382 1801213 1812546 1814218 1976976 1300836 1595981 1278550 ...\n", " $ Y2008: int 1945229 1551826 1752886 1188104 1487315 1875146 1764457 1762096 1193245 1756185 ...\n" ] } ], "source": [ "str(df2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# A set of verbs for operations such as applying filter, selecting specific columns, \n", "# sorting data, adding or deleting columns and aggregating data. " ] } ], "metadata": { "kernelspec": { "display_name": "R 3.3", "language": "R", "name": "ir33" }, "language_info": { "codemirror_mode": "r", "file_extension": ".r", "mimetype": "text/x-r-source", "name": "R", "pygments_lexer": "r", "version": "3.3.2" } }, "nbformat": 4, "nbformat_minor": 2 }