{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.1" }, "orig_nbformat": 2, "kernelspec": { "name": "python371jvsc74a57bd0e09bc63938c7c12a1d261af93fef4c49c4bc4ec4b59849b02ca92b0abe12c74c", "display_name": "Python 3.7.1 64-bit ('env': conda)" }, "colab": { "name": "Read data from cassandra.ipynb", "provenance": [] } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "Ezeu9nViz3yv" }, "source": [ "# Read Cassandra as DataFrame\n", "> Short guide on how to read data from Cassandra into pandas dataframe format\n", "\n", "- toc: true\n", "- badges: true\n", "- comments: true\n", "- categories: [Cassandra]\n", "- image:" ] }, { "cell_type": "code", "metadata": { "id": "fYlxGtNWzxrz" }, "source": [ "import os\n", "from cassandra.cqlengine.models import Model\n", "from cassandra.cqlengine import columns\n", "from datetime import datetime\n", "import pandas as pd" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "PUfn2tsyzxr4" }, "source": [ "import os\n", "from datetime import datetime\n", "\n", "from cassandra.cqlengine.management import sync_table\n", "from cassandra.policies import TokenAwarePolicy\n", "from cassandra.auth import PlainTextAuthProvider\n", "from cassandra.cluster import (\n", " Cluster,\n", " DCAwareRoundRobinPolicy\n", ")\n", "from cassandra.cqlengine.connection import (\n", " register_connection,\n", " set_default_connection\n", ")" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "J_ZJSfPWzxr6" }, "source": [ "CASSANDRA_USERNAME='cassandra'\n", "CASSANDRA_PASSWORD='cassandra'\n", "CASSANDRA_HOST='127.0.0.1'\n", "CASSANDRA_PORT=9042" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "vVnyNK7Szxr7", "outputId": "be54579a-11b7-4496-be81-a3de8cb9e543" }, "source": [ "session = None\n", "cluster = None\n", "\n", "auth_provider = PlainTextAuthProvider(username=CASSANDRA_USERNAME, password=CASSANDRA_PASSWORD)\n", "cluster = Cluster([CASSANDRA_HOST],\n", "load_balancing_policy=TokenAwarePolicy(DCAwareRoundRobinPolicy()),\n", "port=CASSANDRA_PORT,\n", "auth_provider=auth_provider,\n", "executor_threads=2,\n", "protocol_version=4,\n", ") " ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "ipykernel_launcher:10: DeprecationWarning: Legacy execution parameters will be removed in 4.0. Consider using execution profiles.\n" ], "name": "stderr" } ] }, { "cell_type": "code", "metadata": { "id": "tz58KhpFzxsB" }, "source": [ "session = cluster.connect()\n", "register_connection(str(session), session=session)\n", "set_default_connection(str(session))" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "m2aSYu5ZzxsC", "outputId": "f603f5ce-e087-4f93-9209-aa4a27d43823" }, "source": [ "rows = session.execute('select * from demo.click_stream;')\n", "df = pd.DataFrame(list(rows))\n", "df.head()" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " pk created_at client_ip \\\n", "0 2021-05-01 2021-05-01 15:59:08.188 172.20.0.1 \n", "1 2021-05-01 2021-05-01 15:59:08.256 172.20.0.1 \n", "2 2021-05-01 2021-05-01 15:59:08.265 172.20.0.1 \n", "3 2021-05-01 2021-05-01 15:59:08.270 172.20.0.1 \n", "4 2021-05-01 2021-05-01 15:59:08.280 172.20.0.1 \n", "\n", " cookie_id event_name item_id \\\n", "0 0:ko0sujvr:WcQ5Z8kIqvWRLp3~Rlpxu_ygeqqQMdkN view 10009 \n", "1 0:ko0sujvr:WcQ5Z8kIqvWRLp3~Rlpxu_ygeqqQMdkN view 10004 \n", "2 0:ko0sujvr:WcQ5Z8kIqvWRLp3~Rlpxu_ygeqqQMdkN view 10007 \n", "3 0:ko0sujvr:WcQ5Z8kIqvWRLp3~Rlpxu_ygeqqQMdkN view 10002 \n", "4 0:ko0sujvr:WcQ5Z8kIqvWRLp3~Rlpxu_ygeqqQMdkN view 10001 \n", "\n", " url \n", "0 http://127.0.0.1:8051/ \n", "1 http://127.0.0.1:8051/ \n", "2 http://127.0.0.1:8051/ \n", "3 http://127.0.0.1:8051/ \n", "4 http://127.0.0.1:8051/ " ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pkcreated_atclient_ipcookie_idevent_nameitem_idurl
02021-05-012021-05-01 15:59:08.188172.20.0.10:ko0sujvr:WcQ5Z8kIqvWRLp3~Rlpxu_ygeqqQMdkNview10009http://127.0.0.1:8051/
12021-05-012021-05-01 15:59:08.256172.20.0.10:ko0sujvr:WcQ5Z8kIqvWRLp3~Rlpxu_ygeqqQMdkNview10004http://127.0.0.1:8051/
22021-05-012021-05-01 15:59:08.265172.20.0.10:ko0sujvr:WcQ5Z8kIqvWRLp3~Rlpxu_ygeqqQMdkNview10007http://127.0.0.1:8051/
32021-05-012021-05-01 15:59:08.270172.20.0.10:ko0sujvr:WcQ5Z8kIqvWRLp3~Rlpxu_ygeqqQMdkNview10002http://127.0.0.1:8051/
42021-05-012021-05-01 15:59:08.280172.20.0.10:ko0sujvr:WcQ5Z8kIqvWRLp3~Rlpxu_ygeqqQMdkNview10001http://127.0.0.1:8051/
\n", "
" ] }, "metadata": { "tags": [] }, "execution_count": 6 } ] }, { "cell_type": "code", "metadata": { "id": "0i4RM91czxsD", "outputId": "fee58e18-97e9-4a4c-c72f-165fedd8431b" }, "source": [ "df.info()" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "\n", "RangeIndex: 265 entries, 0 to 264\n", "Data columns (total 7 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 pk 265 non-null object \n", " 1 created_at 265 non-null datetime64[ns]\n", " 2 client_ip 265 non-null object \n", " 3 cookie_id 265 non-null object \n", " 4 event_name 265 non-null object \n", " 5 item_id 265 non-null object \n", " 6 url 265 non-null object \n", "dtypes: datetime64[ns](1), object(6)\n", "memory usage: 14.6+ KB\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "2ZE4ekoEzxsE", "outputId": "7c32c54d-deb2-48ad-b91d-8fe725d65f01" }, "source": [ "df.describe()" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "ipykernel_launcher:1: FutureWarning: Treating datetime data as categorical rather than numeric in `.describe` is deprecated and will be removed in a future version of pandas. Specify `datetime_is_numeric=True` to silence this warning and adopt the future behavior now.\n" ], "name": "stderr" }, { "output_type": "execute_result", "data": { "text/plain": [ " pk created_at client_ip \\\n", "count 265 265 265 \n", "unique 1 265 1 \n", "top 2021-05-01 2021-05-01 18:43:48.776000 172.20.0.1 \n", "freq 265 1 265 \n", "first NaN 2021-05-01 15:59:08.188000 NaN \n", "last NaN 2021-05-01 18:44:17.576000 NaN \n", "\n", " cookie_id event_name item_id \\\n", "count 265 265 265 \n", "unique 5 4 12 \n", "top 0:ko5xo7an:1ORfub~MP9WFwk9lasJHFV70F~lkpcVk view 10002 \n", "freq 160 260 38 \n", "first NaN NaN NaN \n", "last NaN NaN NaN \n", "\n", " url \n", "count 265 \n", "unique 2 \n", "top http://127.0.0.1:8051/ \n", "freq 264 \n", "first NaN \n", "last NaN " ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pkcreated_atclient_ipcookie_idevent_nameitem_idurl
count265265265265265265265
unique1265154122
top2021-05-012021-05-01 18:43:48.776000172.20.0.10:ko5xo7an:1ORfub~MP9WFwk9lasJHFV70F~lkpcVkview10002http://127.0.0.1:8051/
freq265126516026038264
firstNaN2021-05-01 15:59:08.188000NaNNaNNaNNaNNaN
lastNaN2021-05-01 18:44:17.576000NaNNaNNaNNaNNaN
\n", "
" ] }, "metadata": { "tags": [] }, "execution_count": 8 } ] }, { "cell_type": "code", "metadata": { "id": "UVfC-exMzxsF", "outputId": "854da3c4-6219-4074-8c15-29d21017fbad" }, "source": [ "df.item_id.value_counts()" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "10002 38\n", "10006 34\n", "10003 30\n", "10000 28\n", "10009 27\n", "10001 24\n", "10004 24\n", "10007 20\n", "10005 20\n", "10008 18\n", "1204 1\n", "1231 1\n", "Name: item_id, dtype: int64" ] }, "metadata": { "tags": [] }, "execution_count": 9 } ] }, { "cell_type": "code", "metadata": { "id": "YhMWrOQAzxsG" }, "source": [ "df.to_pickle('../recommender/data/logs_test_020521_1.p')" ], "execution_count": null, "outputs": [] } ] }