{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Start with a table of data, we'll make up an employee database and a sales database" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "using CSV, DataFramesMeta, Statistics, Dates" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

8 rows × 5 columns

idfirst_namelast_namedepartmentsalary
Int64StringStringStringInt64
11MichaelScottManagement & Admin5100
22DwightSchruteSales4200
33AngelaMartinAccounting3750
44JimHalpertSales4300
55PamBeeslyManagement & Admin2200
66OscarNunezAccounting3400
77MeredithPalmerPurchasing3300
88CreedBrattonPurchasing3200
" ], "text/latex": [ "\\begin{tabular}{r|ccccc}\n", "\t& id & first\\_name & last\\_name & department & salary\\\\\n", "\t\\hline\n", "\t& Int64 & String & String & String & Int64\\\\\n", "\t\\hline\n", "\t1 & 1 & Michael & Scott & Management \\& Admin & 5100 \\\\\n", "\t2 & 2 & Dwight & Schrute & Sales & 4200 \\\\\n", "\t3 & 3 & Angela & Martin & Accounting & 3750 \\\\\n", "\t4 & 4 & Jim & Halpert & Sales & 4300 \\\\\n", "\t5 & 5 & Pam & Beesly & Management \\& Admin & 2200 \\\\\n", "\t6 & 6 & Oscar & Nunez & Accounting & 3400 \\\\\n", "\t7 & 7 & Meredith & Palmer & Purchasing & 3300 \\\\\n", "\t8 & 8 & Creed & Bratton & Purchasing & 3200 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m8×5 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m first_name \u001b[0m\u001b[1m last_name \u001b[0m\u001b[1m department \u001b[0m\u001b[1m salary \u001b[0m\n", "\u001b[1m \u001b[0m│\u001b[90m Int64 \u001b[0m\u001b[90m String \u001b[0m\u001b[90m String \u001b[0m\u001b[90m String \u001b[0m\u001b[90m Int64 \u001b[0m\n", "─────┼──────────────────────────────────────────────────────────\n", " 1 │ 1 Michael Scott Management & Admin 5100\n", " 2 │ 2 Dwight Schrute Sales 4200\n", " 3 │ 3 Angela Martin Accounting 3750\n", " 4 │ 4 Jim Halpert Sales 4300\n", " 5 │ 5 Pam Beesly Management & Admin 2200\n", " 6 │ 6 Oscar Nunez Accounting 3400\n", " 7 │ 7 Meredith Palmer Purchasing 3300\n", " 8 │ 8 Creed Bratton Purchasing 3200" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# DataFrame(column=data)\n", "df = DataFrame(id=1:8,\n", " first_name=[\"Michael\", \"Dwight\", \"Angela\", \"Jim\", \"Pam\", \"Oscar\", \"Meredith\", \"Creed\"],\n", " last_name=[\"Scott\", \"Schrute\", \"Martin\", \"Halpert\", \"Beesly\", \"Nunez\", \"Palmer\", \"Bratton\"],\n", " department=[\"Management & Admin\", \"Sales\", \"Accounting\", \"Sales\", \"Management & Admin\", \"Accounting\", \n", " \"Purchasing\", \"Purchasing\"],\n", " salary=[5100, 4200, 3750, 4300, 2200, 3400, 3300, 3200])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's create a sales database called `db_sales` with client information (thanks to [this site](https://theoffice.fandom.com/wiki/Clients_of_Dunder_Mifflin))" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[1m6×5 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m transaction_date \u001b[0m\u001b[1m employee_id \u001b[0m\u001b[1m quantity \u001b[0m\u001b[1m customer \u001b[0m\n", "\u001b[1m \u001b[0m│\u001b[90m Int64 \u001b[0m\u001b[90m Date \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m String \u001b[0m\n", "─────┼───────────────────────────────────────────────────────────────────────────────────\n", " 1 │ 1 2006-01-02 4 100 Dunmore High School\n", " 2 │ 2 2006-01-29 2 500 Harper Collins\n", " 3 │ 3 2006-02-01 4 600 Blue Cross of Pennsylvania\n", " 4 │ 4 2006-02-14 2 200 Apex Technology\n", " 5 │ 5 2006-03-01 4 400 Blue Cross of Pennsylvania\n", " 6 │ 6 2006-03-20 2 250 Stone, Cooper, and Grandy: Attor…" ] } ], "source": [ "# Parse dates as Date objects\n", "dates = [\"1-2-2006\", \"1-29-2006\", \"2-1-2006\", \"2-14-2006\", \"3-1-2006\", \"3-20-2006\"]\n", "dates = parse.(Date, dates, dateformat\"m-d-y\")\n", "\n", "db_sales = DataFrame(id=1:6,\n", " transaction_date=dates,\n", " employee_id=[4, 2, 4, 2, 4, 2],\n", " quantity=[100, 500, 600, 200, 400, 250],\n", " customer=[\"Dunmore High School\", \"Harper Collins\", \"Blue Cross of Pennsylvania\", \n", " \"Apex Technology\", \"Blue Cross of Pennsylvania\", \n", " \"Stone, Cooper, and Grandy: Attorneys at Law\"])\n", "\n", "db_sales |> print" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

1 rows × 5 columns

idfirst_namelast_namedepartmentsalary
Int64StringStringStringInt64
11MichaelScottManagement & Admin5100
" ], "text/latex": [ "\\begin{tabular}{r|ccccc}\n", "\t& id & first\\_name & last\\_name & department & salary\\\\\n", "\t\\hline\n", "\t& Int64 & String & String & String & Int64\\\\\n", "\t\\hline\n", "\t1 & 1 & Michael & Scott & Management \\& Admin & 5100 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m1×5 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m first_name \u001b[0m\u001b[1m last_name \u001b[0m\u001b[1m department \u001b[0m\u001b[1m salary \u001b[0m\n", "\u001b[1m \u001b[0m│\u001b[90m Int64 \u001b[0m\u001b[90m String \u001b[0m\u001b[90m String \u001b[0m\u001b[90m String \u001b[0m\u001b[90m Int64 \u001b[0m\n", "─────┼──────────────────────────────────────────────────────────\n", " 1 │ 1 Michael Scott Management & Admin 5100" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#DataFrames\n", "subset(df, \n", " :id => ByRow(x -> x >= 1),\n", " :first_name => ByRow(x -> x ==\"Michael\")\n", " )" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

2 rows × 5 columns

idfirst_namelast_namedepartmentsalary
Int64StringStringStringInt64
11MichaelScottManagement & Admin5100
22DwightSchruteSales4200
" ], "text/latex": [ "\\begin{tabular}{r|ccccc}\n", "\t& id & first\\_name & last\\_name & department & salary\\\\\n", "\t\\hline\n", "\t& Int64 & String & String & String & Int64\\\\\n", "\t\\hline\n", "\t1 & 1 & Michael & Scott & Management \\& Admin & 5100 \\\\\n", "\t2 & 2 & Dwight & Schrute & Sales & 4200 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m2×5 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m first_name \u001b[0m\u001b[1m last_name \u001b[0m\u001b[1m department \u001b[0m\u001b[1m salary \u001b[0m\n", "\u001b[1m \u001b[0m│\u001b[90m Int64 \u001b[0m\u001b[90m String \u001b[0m\u001b[90m String \u001b[0m\u001b[90m String \u001b[0m\u001b[90m Int64 \u001b[0m\n", "─────┼──────────────────────────────────────────────────────────\n", " 1 │ 1 Michael Scott Management & Admin 5100\n", " 2 │ 2 Dwight Schrute Sales 4200" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#With DataFramesMeta it's easier to layer on complexity like do 'or' statements a little easier, plus it's more terse\n", "@rsubset(df,\n", " (:id == 2) | (:first_name ==\"Michael\")\n", " )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Regex string matching\n", "\n", "From Regex String Matching Docs: [regex literals](https://docs.julialang.org/en/v1/manual/strings/#man-regex-literals)\n", "\n", "```julia\n", "julia> occursin(r\"^\\s*(?:#|$)\", \"# a comment\")\n", "true\n", "```\n", "\n", "Similar to `pandas` `pd.Series.str.contains()`, we use the base `occursin()` string function here to the powerful `subset` or `rsubset` functions." ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

2 rows × 5 columns

idfirst_namelast_namedepartmentsalary
Int64StringStringStringInt64
11MichaelScottManagement & Admin5100
25PamBeeslyManagement & Admin2200
" ], "text/latex": [ "\\begin{tabular}{r|ccccc}\n", "\t& id & first\\_name & last\\_name & department & salary\\\\\n", "\t\\hline\n", "\t& Int64 & String & String & String & Int64\\\\\n", "\t\\hline\n", "\t1 & 1 & Michael & Scott & Management \\& Admin & 5100 \\\\\n", "\t2 & 5 & Pam & Beesly & Management \\& Admin & 2200 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m2×5 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m first_name \u001b[0m\u001b[1m last_name \u001b[0m\u001b[1m department \u001b[0m\u001b[1m salary \u001b[0m\n", "\u001b[1m \u001b[0m│\u001b[90m Int64 \u001b[0m\u001b[90m String \u001b[0m\u001b[90m String \u001b[0m\u001b[90m String \u001b[0m\u001b[90m Int64 \u001b[0m\n", "─────┼──────────────────────────────────────────────────────────\n", " 1 │ 1 Michael Scott Management & Admin 5100\n", " 2 │ 5 Pam Beesly Management & Admin 2200" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#DataFrames\n", "subset(df, :department => ByRow(x -> occursin(\"Admin\", x)))\n", "\n", "#DataFramesMeta\n", "@rsubset(df, occursin(\"Admin\", :department))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## GROUP BY\n", "\n", "Grouping and aggregating with `groupby`, `combine`, `@combine`, and `@by`" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

4 rows × 3 columns

departmentAverage Salarycount
StringFloat64Int64
1Management & Admin3650.02
2Sales4250.02
3Accounting3575.02
4Purchasing3250.02
" ], "text/latex": [ "\\begin{tabular}{r|ccc}\n", "\t& department & Average Salary & count\\\\\n", "\t\\hline\n", "\t& String & Float64 & Int64\\\\\n", "\t\\hline\n", "\t1 & Management \\& Admin & 3650.0 & 2 \\\\\n", "\t2 & Sales & 4250.0 & 2 \\\\\n", "\t3 & Accounting & 3575.0 & 2 \\\\\n", "\t4 & Purchasing & 3250.0 & 2 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m4×3 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m department \u001b[0m\u001b[1m Average Salary \u001b[0m\u001b[1m count \u001b[0m\n", "\u001b[1m \u001b[0m│\u001b[90m String \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Int64 \u001b[0m\n", "─────┼───────────────────────────────────────────\n", " 1 │ Management & Admin 3650.0 2\n", " 2 │ Sales 4250.0 2\n", " 3 │ Accounting 3575.0 2\n", " 4 │ Purchasing 3250.0 2" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gd = groupby(df, :department)\n", "#DataFrames\n", "combine(gd, :salary => mean => :\"Average Salary\", \n", " :department => length => :count)\n", "\n", "#DataFramesMeta\n", "@combine(gd, :\"Average Salary\" = mean(:salary),\n", " :count = length(:department))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "bonus: counts" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Alternatively, instead of doing `groupby()` and then `combine()`, you can do `@by` as a shorthand:" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

4 rows × 3 columns

departmentAverage Salarycount
StringFloat64Int64
1Management & Admin3650.02
2Sales4250.02
3Accounting3575.02
4Purchasing3250.02
" ], "text/latex": [ "\\begin{tabular}{r|ccc}\n", "\t& department & Average Salary & count\\\\\n", "\t\\hline\n", "\t& String & Float64 & Int64\\\\\n", "\t\\hline\n", "\t1 & Management \\& Admin & 3650.0 & 2 \\\\\n", "\t2 & Sales & 4250.0 & 2 \\\\\n", "\t3 & Accounting & 3575.0 & 2 \\\\\n", "\t4 & Purchasing & 3250.0 & 2 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m4×3 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m department \u001b[0m\u001b[1m Average Salary \u001b[0m\u001b[1m count \u001b[0m\n", "\u001b[1m \u001b[0m│\u001b[90m String \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Int64 \u001b[0m\n", "─────┼───────────────────────────────────────────\n", " 1 │ Management & Admin 3650.0 2\n", " 2 │ Sales 4250.0 2\n", " 3 │ Accounting 3575.0 2\n", " 4 │ Purchasing 3250.0 2" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "@by(df, :department, \n", " :\"Average Salary\" = mean(:salary), \n", " :count=length(:salary))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## PARTITION BY" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

8 rows × 6 columns

idfirst_namelast_namedepartmentsalaryAverage Salary
Int64StringStringStringInt64Float64
11MichaelScottManagement & Admin51003650.0
22DwightSchruteSales42004250.0
33AngelaMartinAccounting37503575.0
44JimHalpertSales43004250.0
55PamBeeslyManagement & Admin22003650.0
66OscarNunezAccounting34003575.0
77MeredithPalmerPurchasing33003250.0
88CreedBrattonPurchasing32003250.0
" ], "text/latex": [ "\\begin{tabular}{r|cccccc}\n", "\t& id & first\\_name & last\\_name & department & salary & Average Salary\\\\\n", "\t\\hline\n", "\t& Int64 & String & String & String & Int64 & Float64\\\\\n", "\t\\hline\n", "\t1 & 1 & Michael & Scott & Management \\& Admin & 5100 & 3650.0 \\\\\n", "\t2 & 2 & Dwight & Schrute & Sales & 4200 & 4250.0 \\\\\n", "\t3 & 3 & Angela & Martin & Accounting & 3750 & 3575.0 \\\\\n", "\t4 & 4 & Jim & Halpert & Sales & 4300 & 4250.0 \\\\\n", "\t5 & 5 & Pam & Beesly & Management \\& Admin & 2200 & 3650.0 \\\\\n", "\t6 & 6 & Oscar & Nunez & Accounting & 3400 & 3575.0 \\\\\n", "\t7 & 7 & Meredith & Palmer & Purchasing & 3300 & 3250.0 \\\\\n", "\t8 & 8 & Creed & Bratton & Purchasing & 3200 & 3250.0 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m8×6 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m first_name \u001b[0m\u001b[1m last_name \u001b[0m\u001b[1m department \u001b[0m\u001b[1m salary \u001b[0m\u001b[1m Average Salar\u001b[0m ⋯\n", "\u001b[1m \u001b[0m│\u001b[90m Int64 \u001b[0m\u001b[90m String \u001b[0m\u001b[90m String \u001b[0m\u001b[90m String \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Float64 \u001b[0m ⋯\n", "─────┼──────────────────────────────────────────────────────────────────────────\n", " 1 │ 1 Michael Scott Management & Admin 5100 3650. ⋯\n", " 2 │ 2 Dwight Schrute Sales 4200 4250.\n", " 3 │ 3 Angela Martin Accounting 3750 3575.\n", " 4 │ 4 Jim Halpert Sales 4300 4250.\n", " 5 │ 5 Pam Beesly Management & Admin 2200 3650. ⋯\n", " 6 │ 6 Oscar Nunez Accounting 3400 3575.\n", " 7 │ 7 Meredith Palmer Purchasing 3300 3250.\n", " 8 │ 8 Creed Bratton Purchasing 3200 3250.\n", "\u001b[36m 1 column omitted\u001b[0m" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#DataFrames\n", "transform(gd, :salary => mean => :\"Average Salary\")\n", "\n", "#DataFramesMeta\n", "@transform(gd, :\"Average Salary\"=mean(:salary))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## JOIN\n", "\n", "Joins are fairly self explanatory - the `on` argument is nice and flexible where you can pass it a column or pairs of columns (i.e. `[:left=>:right]`) for the mapping. In the case of duplicate column names, you can set `makeunique=true` to append a string when there are conflicts:" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

12 rows × 9 columns (omitted printing of 2 columns)

idfirst_namelast_namedepartmentsalaryid_1transaction_date
Int64StringStringStringInt64Int64?Date?
12DwightSchruteSales420022006-01-29
22DwightSchruteSales420042006-02-14
32DwightSchruteSales420062006-03-20
44JimHalpertSales430012006-01-02
54JimHalpertSales430032006-02-01
64JimHalpertSales430052006-03-01
71MichaelScottManagement & Admin5100missingmissing
83AngelaMartinAccounting3750missingmissing
95PamBeeslyManagement & Admin2200missingmissing
106OscarNunezAccounting3400missingmissing
117MeredithPalmerPurchasing3300missingmissing
128CreedBrattonPurchasing3200missingmissing
" ], "text/latex": [ "\\begin{tabular}{r|cccccccc}\n", "\t& id & first\\_name & last\\_name & department & salary & id\\_1 & transaction\\_date & \\\\\n", "\t\\hline\n", "\t& Int64 & String & String & String & Int64 & Int64? & Date? & \\\\\n", "\t\\hline\n", "\t1 & 2 & Dwight & Schrute & Sales & 4200 & 2 & 2006-01-29 & $\\dots$ \\\\\n", "\t2 & 2 & Dwight & Schrute & Sales & 4200 & 4 & 2006-02-14 & $\\dots$ \\\\\n", "\t3 & 2 & Dwight & Schrute & Sales & 4200 & 6 & 2006-03-20 & $\\dots$ \\\\\n", "\t4 & 4 & Jim & Halpert & Sales & 4300 & 1 & 2006-01-02 & $\\dots$ \\\\\n", "\t5 & 4 & Jim & Halpert & Sales & 4300 & 3 & 2006-02-01 & $\\dots$ \\\\\n", "\t6 & 4 & Jim & Halpert & Sales & 4300 & 5 & 2006-03-01 & $\\dots$ \\\\\n", "\t7 & 1 & Michael & Scott & Management \\& Admin & 5100 & \\emph{missing} & \\emph{missing} & $\\dots$ \\\\\n", "\t8 & 3 & Angela & Martin & Accounting & 3750 & \\emph{missing} & \\emph{missing} & $\\dots$ \\\\\n", "\t9 & 5 & Pam & Beesly & Management \\& Admin & 2200 & \\emph{missing} & \\emph{missing} & $\\dots$ \\\\\n", "\t10 & 6 & Oscar & Nunez & Accounting & 3400 & \\emph{missing} & \\emph{missing} & $\\dots$ \\\\\n", "\t11 & 7 & Meredith & Palmer & Purchasing & 3300 & \\emph{missing} & \\emph{missing} & $\\dots$ \\\\\n", "\t12 & 8 & Creed & Bratton & Purchasing & 3200 & \\emph{missing} & \\emph{missing} & $\\dots$ \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m12×9 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m first_name \u001b[0m\u001b[1m last_name \u001b[0m\u001b[1m department \u001b[0m\u001b[1m salary \u001b[0m\u001b[1m id_1 \u001b[0m\u001b[1m tran\u001b[0m ⋯\n", "\u001b[1m \u001b[0m│\u001b[90m Int64 \u001b[0m\u001b[90m String \u001b[0m\u001b[90m String \u001b[0m\u001b[90m String \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64? \u001b[0m\u001b[90m Date\u001b[0m ⋯\n", "─────┼──────────────────────────────────────────────────────────────────────────\n", " 1 │ 2 Dwight Schrute Sales 4200 2 2006 ⋯\n", " 2 │ 2 Dwight Schrute Sales 4200 4 2006\n", " 3 │ 2 Dwight Schrute Sales 4200 6 2006\n", " 4 │ 4 Jim Halpert Sales 4300 1 2006\n", " 5 │ 4 Jim Halpert Sales 4300 3 2006 ⋯\n", " 6 │ 4 Jim Halpert Sales 4300 5 2006\n", " 7 │ 1 Michael Scott Management & Admin 5100 \u001b[90m missing \u001b[0m\u001b[90m miss\u001b[0m\n", " 8 │ 3 Angela Martin Accounting 3750 \u001b[90m missing \u001b[0m\u001b[90m miss\u001b[0m\n", " 9 │ 5 Pam Beesly Management & Admin 2200 \u001b[90m missing \u001b[0m\u001b[90m miss\u001b[0m ⋯\n", " 10 │ 6 Oscar Nunez Accounting 3400 \u001b[90m missing \u001b[0m\u001b[90m miss\u001b[0m\n", " 11 │ 7 Meredith Palmer Purchasing 3300 \u001b[90m missing \u001b[0m\u001b[90m miss\u001b[0m\n", " 12 │ 8 Creed Bratton Purchasing 3200 \u001b[90m missing \u001b[0m\u001b[90m miss\u001b[0m\n", "\u001b[36m 3 columns omitted\u001b[0m" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "leftjoin(df, db_sales, on=[:id=>:employee_id], makeunique=true)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Putting it all together\n", "\n", "### `@chain`\n", "\n", "The `@chain` macro allows you to chain things together so you don't need to keep specifying the first argument in the function calls. This is similar to method chaining in python or the pipe in `dplyr` in R." ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

2 rows × 6 columns

idfirst_namelast_namedepartmenttotal_quantitynumber_of_customers
Int64StringStringStringInt64?Int64?
14JimHalpertSales11003
22DwightSchruteSales9503
" ], "text/latex": [ "\\begin{tabular}{r|cccccc}\n", "\t& id & first\\_name & last\\_name & department & total\\_quantity & number\\_of\\_customers\\\\\n", "\t\\hline\n", "\t& Int64 & String & String & String & Int64? & Int64?\\\\\n", "\t\\hline\n", "\t1 & 4 & Jim & Halpert & Sales & 1100 & 3 \\\\\n", "\t2 & 2 & Dwight & Schrute & Sales & 950 & 3 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m2×6 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m first_name \u001b[0m\u001b[1m last_name \u001b[0m\u001b[1m department \u001b[0m\u001b[1m total_quantity \u001b[0m\u001b[1m number_of_cus\u001b[0m ⋯\n", "\u001b[1m \u001b[0m│\u001b[90m Int64 \u001b[0m\u001b[90m String \u001b[0m\u001b[90m String \u001b[0m\u001b[90m String \u001b[0m\u001b[90m Int64? \u001b[0m\u001b[90m Int64? \u001b[0m ⋯\n", "─────┼──────────────────────────────────────────────────────────────────────────\n", " 1 │ 4 Jim Halpert Sales 1100 ⋯\n", " 2 │ 2 Dwight Schrute Sales 950\n", "\u001b[36m 1 column omitted\u001b[0m" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sales = @chain db_sales begin\n", " groupby(:employee_id)\n", " @combine(:total_quantity=sum(:quantity), \n", " :number_of_customers=length(:customer))\n", "end\n", "\n", "result = @chain df begin\n", " @select(:id, :first_name, :last_name, :department)\n", " @rsubset(:department==\"Sales\")\n", " leftjoin(sales, on=[:id=>:employee_id])\n", " @orderby(sort(:total_quantity, rev=true)) # order by descending\n", " end" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Julia 1.7.2", "language": "julia", "name": "julia-1.7" }, "language_info": { "file_extension": ".jl", "mimetype": "application/julia", "name": "julia", "version": "1.7.2" } }, "nbformat": 4, "nbformat_minor": 4 }