{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Postgres\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# NBVAL_IGNORE_OUTPUT\n", "from sqlalchemy import sql\n", "from sqlalchemy import Table, Column, Integer, String, MetaData, ForeignKey\n", "from sqlalchemy import create_engine\n", "import os\n", "\n", "port = os.environ.get(\"SB_TEST_PGPORT\", \"5432\")\n", "pwd = os.environ.get(\"SB_TEST_PGPASSWORD\", \"\")\n", "engine = create_engine('postgresql://postgres:%s@localhost:%s/postgres'%(pwd, port), echo=False)\n", "\n", "\n", "metadata = MetaData()\n", "users = Table('users', metadata,\n", " Column('id', Integer, primary_key=True),\n", " Column('name', String),\n", " Column('fullname', String),\n", ")\n", "\n", "\n", "addresses = Table('addresses', metadata,\n", " Column('id', Integer, primary_key=True),\n", " Column('user_id', None, ForeignKey('users.id')),\n", " Column('email_address', String, nullable=False)\n", " )\n", "\n", "metadata.drop_all(engine)\n", "metadata.create_all(engine)\n", "\n", "conn = engine.connect()\n", "\n", "ins = users.insert().values(name='jack', fullname='Jack Jones')\n", "result = conn.execute(ins)\n", "\n", "\n", "ins = users.insert()\n", "conn.execute(ins, id=2, name='wendy', fullname='Wendy Williams')\n", "\n", "\n", "conn.execute(addresses.insert(), [\n", " {'user_id': 1, 'email_address' : 'jack@yahoo.com'},\n", " {'user_id': 1, 'email_address' : 'jack@msn.com'},\n", " {'user_id': 2, 'email_address' : 'www@www.org'},\n", " {'user_id': 2, 'email_address' : 'wendy@aol.com'},\n", "])\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Big Example" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SELECT anon_1.id, anon_1.user_id, anon_1.email_address, anon_1.num \n", "FROM (SELECT id, user_id, email_address, num, min(anon_2.id) OVER (PARTITION BY anon_2.user_id) AS win1 \n", "FROM (SELECT id, user_id, email_address, dense_rank() OVER (PARTITION BY addresses.user_id ORDER BY addresses.id) AS num \n", "FROM addresses) AS anon_2) AS anon_1 \n", "WHERE anon_1.id > anon_1.win1 AND (anon_1.email_address LIKE 'jack' || '%%')\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/Users/machow/Dropbox/Repo/siuba/siuba/sql/translate.py:31: SiubaSqlRuntimeWarning: \n", "dense_rank sql translation defaults na_option to None. To return identical result as pandas, use na_option = 'keep'.\n", "\n", "This warning only displays once per function\n", " SiubaSqlRuntimeWarning\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
iduser_idemail_addressnum
021jack@msn.com2
\n", "
" ], "text/plain": [ " id user_id email_address num\n", "0 2 1 jack@msn.com 2" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from siuba import *\n", "from siuba.sql.verbs import LazyTbl, collect, show_query\n", "from siuba.sql.dply.vector import dense_rank\n", "import siuba.meta_hook.sqlalchemy.sql.functions as F\n", "\n", "from sqlalchemy import sql\n", "\n", "tbl_addresses = LazyTbl(conn, addresses)\n", "tbl_users = LazyTbl(conn, users)\n", "\n", "#tbl_addresses >> mutate(_, num = dense_rank(_.id)) >> show_query(_)\n", "q = (tbl_addresses\n", " >> group_by(\"user_id\")\n", " >> mutate(num = dense_rank(_.id))\n", " >> filter(\n", " _.id > _.id.min(),\n", " _.email_address.str.startswith(\"jack\")\n", " )\n", " >> ungroup()\n", " >> show_query(simplify = True)\n", " >> collect()\n", " )\n", "\n", "q" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Mutate" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SELECT addresses.id, addresses.user_id, addresses.email_address, dense_rank() OVER (ORDER BY addresses.id) + 1 AS rank \n", "FROM addresses\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/Users/machow/Dropbox/Repo/siuba/siuba/sql/translate.py:31: SiubaSqlRuntimeWarning: \n", "dense_rank sql translation defaults na_option to None. To return identical result as pandas, use na_option = 'keep'.\n", "\n", "This warning only displays once per function\n", " SiubaSqlRuntimeWarning\n" ] } ], "source": [ "q = (tbl_addresses\n", " >> mutate(rank = dense_rank(_.id) + 1)\n", " >> show_query()\n", " )" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SELECT addresses.id, addresses.user_id, addresses.email_address, addresses.id > dense_rank() OVER (PARTITION BY addresses.user_id, addresses.user_id, addresses.user_id ORDER BY addresses.id) + 1 AS rank \n", "FROM addresses\n" ] } ], "source": [ "q = (tbl_addresses\n", " >> group_by(\"user_id\")\n", " >> mutate(rank = _.id > dense_rank(_.id) + 1)\n", " >> show_query()\n", " )" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SELECT anon_1.email, anon_1.is_mikey, anon_1.is_mikey + 1 AS mikey2 \n", "FROM (SELECT anon_2.email AS email, (anon_2.email LIKE 'mikey' || '%%') AS is_mikey \n", "FROM (SELECT addresses.email_address AS email \n", "FROM addresses) AS anon_2) AS anon_1\n" ] } ], "source": [ "# rename and first mutate in same query,\n", "# second mutate is outer query (since uses to prev col)\n", "q = (tbl_addresses\n", " >> select(_.email == _.email_address)\n", " >> mutate(is_mikey = _.email.str.startswith(\"mikey\"), mikey2 = _.is_mikey + 1)\n", " >> show_query()\n", " )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Filter" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SELECT anon_1.id, anon_1.user_id, anon_1.email_address \n", "FROM (SELECT addresses.id AS id, addresses.user_id AS user_id, addresses.email_address AS email_address \n", "FROM addresses) AS anon_1 \n", "WHERE anon_1.id > 1\n" ] } ], "source": [ "q = (tbl_addresses\n", " >> filter(_.id > 1)\n", " >> show_query()\n", " )" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SELECT anon_1.id, anon_1.user_id, anon_1.email_address \n", "FROM (SELECT addresses.id AS id, addresses.user_id AS user_id, addresses.email_address AS email_address \n", "FROM addresses) AS anon_1 \n", "WHERE anon_1.id > 1\n" ] } ], "source": [ "q = (tbl_addresses\n", " >> group_by(\"user_id\")\n", " >> filter(_.id > 1)\n", " >> show_query()\n", " )\n" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SELECT anon_1.id, anon_1.user_id, anon_1.email_address \n", "FROM (SELECT anon_2.id AS id, anon_2.user_id AS user_id, anon_2.email_address AS email_address, dense_rank() OVER (PARTITION BY anon_2.user_id ORDER BY anon_2.id) AS win1 \n", "FROM (SELECT addresses.id AS id, addresses.user_id AS user_id, addresses.email_address AS email_address \n", "FROM addresses) AS anon_2) AS anon_1 \n", "WHERE anon_1.win1 > 1\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
iduser_idemail_address
021jack@msn.com
142wendy@aol.com
\n", "
" ], "text/plain": [ " id user_id email_address\n", "0 2 1 jack@msn.com\n", "1 4 2 wendy@aol.com" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "q = (tbl_addresses\n", " >> group_by(\"user_id\")\n", " >> filter(dense_rank(_.id) > 1)\n", " >> show_query()\n", " >> collect()\n", " )\n", "\n", "q" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Summarize" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SELECT addresses.user_id, avg(addresses.id) AS avg_id \n", "FROM addresses GROUP BY addresses.user_id\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
user_idavg_id
023.5
111.5
\n", "
" ], "text/plain": [ " user_id avg_id\n", "0 2 3.5\n", "1 1 1.5" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "q = (tbl_addresses\n", " >> group_by(\"user_id\")\n", " >> summarize(avg_id = _.id.mean())\n", " >> show_query()\n", " >> collect()\n", " )\n", "\n", "q" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SELECT avg(anon_1.id2) AS m_id \n", "FROM (SELECT addresses.id AS id, addresses.user_id AS user_id, addresses.email_address AS email_address, addresses.id + 1 AS id2 \n", "FROM addresses) AS anon_1\n" ] } ], "source": [ "q = (tbl_addresses >> mutate(_, id2 = _.id + 1) >> summarize(_, m_id = _.id2.mean())) >> show_query()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Count" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SELECT anon_1.user_id, anon_1.id, count(*) AS n \n", "FROM (SELECT addresses.id AS id, addresses.user_id AS user_id, addresses.email_address AS email_address \n", "FROM addresses) AS anon_1 GROUP BY anon_1.user_id, anon_1.id ORDER BY n DESC\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
user_ididn
0121
1231
2241
3111
\n", "
" ], "text/plain": [ " user_id id n\n", "0 1 2 1\n", "1 2 3 1\n", "2 2 4 1\n", "3 1 1 1" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "q = (tbl_addresses\n", " >> group_by(\"user_id\")\n", " >> count(_.id)\n", " >> show_query()\n", " >> collect()\n", ")\n", "\n", "q" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Joins" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SELECT anon_1.id, anon_1.user_id, anon_1.email_address, anon_2.fullname, anon_2.name \n", "FROM (SELECT addresses.id AS id, addresses.user_id AS user_id, addresses.email_address AS email_address \n", "FROM addresses) AS anon_1 LEFT OUTER JOIN (SELECT users.id AS id, users.name AS name, users.fullname AS fullname \n", "FROM users) AS anon_2 ON anon_1.user_id = anon_2.id\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
iduser_idemail_addressfullnamename
011jack@yahoo.comJack Jonesjack
121jack@msn.comJack Jonesjack
232www@www.orgWendy Williamswendy
342wendy@aol.comWendy Williamswendy
\n", "
" ], "text/plain": [ " id user_id email_address fullname name\n", "0 1 1 jack@yahoo.com Jack Jones jack\n", "1 2 1 jack@msn.com Jack Jones jack\n", "2 3 2 www@www.org Wendy Williams wendy\n", "3 4 2 wendy@aol.com Wendy Williams wendy" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# TODO: not executable like this, how to get first SELECT out of parens?\n", "# E.g. can use users.join, etc..\n", "q = (tbl_addresses\n", " >> left_join(_, tbl_users, {\"user_id\": \"id\"})\n", " >> show_query()\n", " >> collect()\n", " )\n", "\n", "q" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## case_when" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SELECT addresses.id, addresses.user_id, addresses.email_address, CASE WHEN (addresses.id > 20) THEN 0 WHEN (addresses.id > 1) THEN 1 ELSE addresses.id END AS label \n", "FROM addresses\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
iduser_idemail_addresslabel
011jack@yahoo.com1
121jack@msn.com1
232www@www.org1
342wendy@aol.com1
\n", "
" ], "text/plain": [ " id user_id email_address label\n", "0 1 1 jack@yahoo.com 1\n", "1 2 1 jack@msn.com 1\n", "2 3 2 www@www.org 1\n", "3 4 2 wendy@aol.com 1" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## TODO: fix sql case_when statements\n", "q = (tbl_addresses\n", " >> mutate(\n", " label = case_when(_, {\n", " _.id > 20: 0,\n", " _.id > 1: 1,\n", " True: _.id\n", " })\n", " )\n", " >> show_query()\n", " >> collect()\n", " )\n", "\n", "q" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SELECT anon_1.id, anon_1.user_id, anon_1.email_address \n", "FROM (SELECT addresses.id AS id, addresses.user_id AS user_id, addresses.email_address AS email_address \n", "FROM addresses) AS anon_1 \n", "WHERE CASE WHEN (anon_1.id > 20) THEN true WHEN (anon_1.id > 1) THEN false ELSE true END\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
iduser_idemail_address
011jack@yahoo.com
\n", "
" ], "text/plain": [ " id user_id email_address\n", "0 1 1 jack@yahoo.com" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## TODO: fix sql case_when statements\n", "# works, but better to just use filter normally...\n", "q = (tbl_addresses\n", " >> filter(\n", " case_when(_, {\n", " _.id > 20: True,\n", " _.id > 1: False,\n", " True: True\n", " })\n", " )\n", " >> show_query()\n", " >> collect()\n", " )\n", "\n", "q" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SELECT addresses.id, addresses.user_id, addresses.email_address, CASE WHEN (addresses.id > avg(addresses.id) OVER (PARTITION BY addresses.user_id)) THEN 0 WHEN (addresses.id > 20) THEN 1 ELSE addresses.id END AS label \n", "FROM addresses\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
iduser_idemail_addresslabel
011jack@yahoo.com1
121jack@msn.com0
232www@www.org3
342wendy@aol.com0
\n", "
" ], "text/plain": [ " id user_id email_address label\n", "0 1 1 jack@yahoo.com 1\n", "1 2 1 jack@msn.com 0\n", "2 3 2 www@www.org 3\n", "3 4 2 wendy@aol.com 0" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "q = (tbl_addresses\n", " >> group_by(\"user_id\")\n", " >> mutate(\n", " label = case_when(_, {\n", " _.id > _.id.mean(): 0,\n", " _.id > 20: 1,\n", " True: _.id\n", " })\n", " )\n", " >> show_query()\n", " >> collect()\n", " )\n", "\n", "q" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "█─'__call__'\n", "├─\n", "├─_\n", "└─█─''\n", " └─█─'__call__'\n", " ├─\n", " └─{_.id > 1: 'yeah', True: 'no'}" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# NBVAL_IGNORE_OUTPUT\n", "case_when(_, {_.id > 1: \"yeah\", True: \"no\"})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## if_else" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SELECT addresses.id, addresses.user_id, addresses.email_address, CASE WHEN (dense_rank() OVER (ORDER BY addresses.id) > 1) THEN 'yes' ELSE 'no' END AS big_id \n", "FROM addresses\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/Users/machow/Dropbox/Repo/siuba/siuba/sql/translate.py:31: SiubaSqlRuntimeWarning: \n", "dense_rank sql translation defaults na_option to None. To return identical result as pandas, use na_option = 'keep'.\n", "\n", "This warning only displays once per function\n", " SiubaSqlRuntimeWarning\n" ] } ], "source": [ "q = (tbl_addresses\n", " >> mutate(big_id = if_else(dense_rank(_.id) > 1, \"yes\", \"no\"))\n", " >> show_query()\n", " )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Head" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SELECT addresses.id, addresses.user_id, addresses.email_address \n", "FROM addresses \n", " LIMIT 3\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
iduser_idemail_address
011jack@yahoo.com
121jack@msn.com
232www@www.org
\n", "
" ], "text/plain": [ " id user_id email_address\n", "0 1 1 jack@yahoo.com\n", "1 2 1 jack@msn.com\n", "2 3 2 www@www.org" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(tbl_addresses\n", " >> head(3)\n", " >> show_query()\n", " >> collect()\n", " )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Rename" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SELECT addresses.id AS id2, addresses.user_id, addresses.email_address \n", "FROM addresses\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
id2user_idemail_address
011jack@yahoo.com
121jack@msn.com
232www@www.org
342wendy@aol.com
\n", "
" ], "text/plain": [ " id2 user_id email_address\n", "0 1 1 jack@yahoo.com\n", "1 2 1 jack@msn.com\n", "2 3 2 www@www.org\n", "3 4 2 wendy@aol.com" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(tbl_addresses\n", " >> rename(id2 = \"id\")\n", " >> show_query()\n", " >> collect()\n", " )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Distinct" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SELECT DISTINCT addresses.user_id, addresses.user_id + 1 AS user_id2 \n", "FROM addresses\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
user_iduser_id2
023
112
\n", "
" ], "text/plain": [ " user_id user_id2\n", "0 2 3\n", "1 1 2" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(tbl_addresses\n", " >> distinct(_.user_id, user_id2 = _.user_id + 1)\n", " >> show_query()\n", " >> collect()\n", " )" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SELECT DISTINCT addresses.email_address, dense_rank() OVER (PARTITION BY addresses.user_id ORDER BY addresses.user_id) AS user_id2 \n", "FROM addresses\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/Users/machow/Dropbox/Repo/siuba/siuba/sql/translate.py:31: SiubaSqlRuntimeWarning: \n", "dense_rank sql translation defaults na_option to None. To return identical result as pandas, use na_option = 'keep'.\n", "\n", "This warning only displays once per function\n", " SiubaSqlRuntimeWarning\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
email_addressuser_id2
0jack@msn.com1
1jack@yahoo.com1
2www@www.org1
3wendy@aol.com1
\n", "
" ], "text/plain": [ " email_address user_id2\n", "0 jack@msn.com 1\n", "1 jack@yahoo.com 1\n", "2 www@www.org 1\n", "3 wendy@aol.com 1" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(tbl_addresses\n", " >> group_by(\"user_id\")\n", " >> distinct(_.email_address, user_id2 = dense_rank(_.user_id))\n", " >> show_query()\n", " >> collect()\n", " )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Technical" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Translating symbolic function calls" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
user_idn
022
112
\n", "
" ], "text/plain": [ " user_id n\n", "0 2 2\n", "1 1 2" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from siuba.dply.vector import n\n", "\n", "tbl_addresses \\\n", " >> group_by(\"user_id\") \\\n", " >> summarize(n = n(_)) \\\n", " >> collect()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Translating str methods" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Translating dt methods" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SELECT addresses.id, addresses.user_id, addresses.email_address, EXTRACT(hour FROM addresses.id) AS hour \n", "FROM addresses\n" ] } ], "source": [ "q = tbl_addresses >> mutate(hour = _.id.dt.hour) >> show_query()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## SQL escapes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Window functions" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SELECT anon_1.id, anon_1.user_id, anon_1.email_address, sum(anon_1.user_id) OVER (ORDER BY anon_1.id DESC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cumsum \n", "FROM (SELECT addresses.id AS id, addresses.user_id AS user_id, addresses.email_address AS email_address \n", "FROM addresses ORDER BY addresses.id DESC) AS anon_1 ORDER BY cumsum\n" ] }, { "data": { "text/html": [ "
# Source: lazy query\n",
       "# DB Conn: Engine(postgresql://postgres:***@localhost:5432/postgres)\n",
       "# Preview:\n",
       "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
iduser_idemail_addresscumsum
042wendy@aol.com2
132www@www.org4
221jack@msn.com5
311jack@yahoo.com6
\n", "

# .. may have more rows

" ], "text/plain": [ "# Source: lazy query\n", "# DB Conn: Engine(postgresql://postgres:***@localhost:5432/postgres)\n", "# Preview:\n", " id user_id email_address cumsum\n", "0 4 2 wendy@aol.com 2\n", "1 3 2 www@www.org 4\n", "2 2 1 jack@msn.com 5\n", "3 1 1 jack@yahoo.com 6\n", "# .. may have more rows" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from siuba.dply.vector import desc\n", "(tbl_addresses\n", " >> arrange(desc(_.id))\n", " >> mutate(cumsum = _.user_id.cumsum())\n", " >> arrange(_.cumsum)\n", " >> show_query()\n", " )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Misc" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## postgres specific" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SELECT addresses.id, addresses.user_id, addresses.email_address, round(CAST(addresses.id AS NUMERIC), 2) AS id2 \n", "FROM addresses\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
iduser_idemail_addressid2
011jack@yahoo.com1.0
121jack@msn.com2.0
232www@www.org3.0
342wendy@aol.com4.0
\n", "
" ], "text/plain": [ " id user_id email_address id2\n", "0 1 1 jack@yahoo.com 1.0\n", "1 2 1 jack@msn.com 2.0\n", "2 3 2 www@www.org 3.0\n", "3 4 2 wendy@aol.com 4.0" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(tbl_addresses\n", " >> mutate(id2 = _.id.round(2))\n", " >> show_query()\n", " >> collect()\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## autoload table w/ sqlalchemy" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SELECT users.id, users.name, users.fullname, users.id + 1 AS id2 \n", "FROM users\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idnamefullnameid2
01jackJack Jones2
12wendyWendy Williams3
\n", "
" ], "text/plain": [ " id name fullname id2\n", "0 1 jack Jack Jones 2\n", "1 2 wendy Wendy Williams 3" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import sqlalchemy\n", "\n", "metadata2 = MetaData()\n", "\n", "users2 = sqlalchemy.Table('users', metadata2, autoload = True, autoload_with = engine)\n", "tbl_users2 = LazyTbl(conn, users2)\n", "\n", "(tbl_users\n", " >> mutate(id2 = _.id + 1)\n", " >> show_query()\n", " >> collect()\n", " )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## auto table from string" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SELECT users.id, users.name, users.fullname, users.id + 1 AS id2 \n", "FROM users\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idnamefullnameid2
01jackJack Jones2
12wendyWendy Williams3
\n", "
" ], "text/plain": [ " id name fullname id2\n", "0 1 jack Jack Jones 2\n", "1 2 wendy Wendy Williams 3" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import sqlalchemy\n", "\n", "metadata3 = MetaData()\n", "\n", "tbl_users3 = LazyTbl(conn, \"users\")\n", "\n", "(tbl_users\n", " >> mutate(id2 = _.id + 1)\n", " >> show_query()\n", " >> collect()\n", " )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## LazyTbl repr" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
# Source: lazy query\n",
       "# DB Conn: Engine(postgresql://postgres:***@localhost:5432/postgres)\n",
       "# Preview:\n",
       "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idnamefullnameid2
01jackJack Jones2
12wendyWendy Williams3
\n", "

# .. may have more rows

" ], "text/plain": [ "# Source: lazy query\n", "# DB Conn: Engine(postgresql://postgres:***@localhost:5432/postgres)\n", "# Preview:\n", " id name fullname id2\n", "0 1 jack Jack Jones 2\n", "1 2 wendy Wendy Williams 3\n", "# .. may have more rows" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tbl_users >> mutate(id2 = _.id + 1)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": { "height": "calc(100% - 180px)", "left": "10px", "top": "150px", "width": "165px" }, "toc_section_display": true, "toc_window_display": true } }, "nbformat": 4, "nbformat_minor": 4 }