{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2\n", "\n", "import sys\n", "\n", "# optimus\n", "sys.path.append(\"..\")\n", "\n", "# pypika\n", "sys.path.append(\"../../pypika\")" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from pypika import Query, Table, Field, analytics as an\n", "from pypika.dialects import MySQLQuery as Query" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SELECT MIN('RATING'),MAX('RATING') FROM `main.music`\n" ] } ], "source": [ "agg = [an.Min('RATING'),an.Max('RATING')]\n", "q = Query.from_('main.music').select(*agg)\n", "print(q.get_sql())" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'SELECT MIN(RATING) FROM main.music'" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\"SELECT MIN(RATING) FROM main.music\"" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SELECT `fruits`.`id`,`fruits`.`name` FROM `fruits` JOIN `consumers` ON `fruits`.`consumer_id`=`consumers`.`id` WHERE `fruits`.`harvest_date`+INTERVAL 1 DAY\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcols\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmin_sql\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"id\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[1;32m~\\Documents\\Optimus\\optimus\\helpers\\decorators.py\u001b[0m in \u001b[0;36mwrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 47\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 48\u001b[0m \u001b[0mstart_time\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtimeit\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdefault_timer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 49\u001b[1;33m \u001b[0mf\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 50\u001b[0m \u001b[0m_time\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mround\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtimeit\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdefault_timer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m-\u001b[0m \u001b[0mstart_time\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m2\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 51\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mlog_time\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\Documents\\Optimus\\optimus\\dataframe\\columns.py\u001b[0m in \u001b[0;36mmin_sql\u001b[1;34m(columns)\u001b[0m\n\u001b[0;32m 578\u001b[0m \u001b[1;33m:\u001b[0m\u001b[1;32mreturn\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 579\u001b[0m \"\"\"\n\u001b[1;32m--> 580\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0magg_exprs\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0man\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mMin\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 581\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 582\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\Documents\\Optimus\\optimus\\dataframe\\columns.py\u001b[0m in \u001b[0;36magg_exprs\u001b[1;34m(columns, funcs, tidy, *args)\u001b[0m\n\u001b[0;32m 533\u001b[0m \u001b[1;33m:\u001b[0m\u001b[1;32mreturn\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 534\u001b[0m \"\"\"\n\u001b[1;32m--> 535\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mexec_agg\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcreate_exprs\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfuncs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtidy\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 536\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 537\u001b[0m \u001b[1;33m@\u001b[0m\u001b[0madd_attr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcols\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\Documents\\Optimus\\optimus\\dataframe\\columns.py\u001b[0m in \u001b[0;36mexec_agg\u001b[1;34m(exprs, tidy)\u001b[0m\n\u001b[0;32m 553\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mquery\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mexprs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mshow\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 554\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 555\u001b[1;33m \u001b[0mdf\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0magg\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0mexprs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 556\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 557\u001b[0m \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mparse_col_names_funcs_to_keys\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mto_json\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pyspark\\sql\\dataframe.py\u001b[0m in \u001b[0;36magg\u001b[1;34m(self, *exprs)\u001b[0m\n\u001b[0;32m 1325\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mRow\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmin\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mage\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m2\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1326\u001b[0m \"\"\"\n\u001b[1;32m-> 1327\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgroupBy\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0magg\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0mexprs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1328\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1329\u001b[0m \u001b[1;33m@\u001b[0m\u001b[0msince\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m2.0\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pyspark\\sql\\group.py\u001b[0m in \u001b[0;36magg\u001b[1;34m(self, *exprs)\u001b[0m\n\u001b[0;32m 89\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 90\u001b[0m \u001b[1;31m# Columns\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 91\u001b[1;33m \u001b[1;32massert\u001b[0m \u001b[0mall\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mColumn\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mc\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mexprs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"all exprs should be Column\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 92\u001b[0m jdf = self._jgd.agg(exprs[0]._jc,\n\u001b[0;32m 93\u001b[0m _to_seq(self.sql_ctx._sc, [c._jc for c in exprs[1:]]))\n", "\u001b[1;31mAssertionError\u001b[0m: all exprs should be Column" ] } ], "source": [ "df.cols.min_sql(\"id\")" ] }, { "cell_type": "code", "execution_count": 93, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "57458" ] }, "execution_count": 93, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.cols.max(\"id\")" ] }, { "cell_type": "code", "execution_count": 95, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'name': 'string',\n", " 'id': 'int',\n", " 'nametype': 'string',\n", " 'recclass': 'string',\n", " 'mass (g)': 'double',\n", " 'fall': 'string',\n", " 'year': 'string',\n", " 'reclat': 'double',\n", " 'reclong': 'double',\n", " 'GeoLocation': 'string'}" ] }, "execution_count": 95, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.cols.dtypes()" ] }, { "cell_type": "code", "execution_count": 96, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'id': 0, 'mass (g)': 19}" ] }, "execution_count": 96, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.cols.count_zeros([\"id\",\"mass (g)\"])" ] }, { "cell_type": "code", "execution_count": 111, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'id': 42365, 'mass (g)': 12497}" ] }, "execution_count": 111, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.cols.count_uniques([\"id\",\"mass (g)\"])" ] }, { "cell_type": "code", "execution_count": 116, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+--------+-----------+\n", "|count(1)| recclass|\n", "+--------+-----------+\n", "| 8285| L6|\n", "| 7142| H5|\n", "| 4796| L5|\n", "| 4528| H6|\n", "| 4211| H4|\n", "| 2766| LL5|\n", "| 2043| LL6|\n", "| 1253| L4|\n", "| 428| H4/5|\n", "| 416| CM2|\n", "| 386| H3|\n", "| 365| L3|\n", "| 335| CO3|\n", "| 300| Ureilite|\n", "| 285|Iron, IIIAB|\n", "| 268| LL4|\n", "| 256| CV3|\n", "| 241| Diogenite|\n", "| 240| Howardite|\n", "| 225| LL|\n", "+--------+-----------+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ "df.query(\"SELECT COUNT(*), recclass from df GROUP BY recclass ORDER BY COUNT(*) DESC\").show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.1" } }, "nbformat": 4, "nbformat_minor": 2 }