{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
},
"orig_nbformat": 2,
"kernelspec": {
"name": "python371jvsc74a57bd0e09bc63938c7c12a1d261af93fef4c49c4bc4ec4b59849b02ca92b0abe12c74c",
"display_name": "Python 3.7.1 64-bit ('env': conda)"
},
"colab": {
"name": "Read data from cassandra.ipynb",
"provenance": []
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "Ezeu9nViz3yv"
},
"source": [
"# Read Cassandra as DataFrame\n",
"> Short guide on how to read data from Cassandra into pandas dataframe format\n",
"\n",
"- toc: true\n",
"- badges: true\n",
"- comments: true\n",
"- categories: [Cassandra]\n",
"- image:"
]
},
{
"cell_type": "code",
"metadata": {
"id": "fYlxGtNWzxrz"
},
"source": [
"import os\n",
"from cassandra.cqlengine.models import Model\n",
"from cassandra.cqlengine import columns\n",
"from datetime import datetime\n",
"import pandas as pd"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "PUfn2tsyzxr4"
},
"source": [
"import os\n",
"from datetime import datetime\n",
"\n",
"from cassandra.cqlengine.management import sync_table\n",
"from cassandra.policies import TokenAwarePolicy\n",
"from cassandra.auth import PlainTextAuthProvider\n",
"from cassandra.cluster import (\n",
" Cluster,\n",
" DCAwareRoundRobinPolicy\n",
")\n",
"from cassandra.cqlengine.connection import (\n",
" register_connection,\n",
" set_default_connection\n",
")"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "J_ZJSfPWzxr6"
},
"source": [
"CASSANDRA_USERNAME='cassandra'\n",
"CASSANDRA_PASSWORD='cassandra'\n",
"CASSANDRA_HOST='127.0.0.1'\n",
"CASSANDRA_PORT=9042"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "vVnyNK7Szxr7",
"outputId": "be54579a-11b7-4496-be81-a3de8cb9e543"
},
"source": [
"session = None\n",
"cluster = None\n",
"\n",
"auth_provider = PlainTextAuthProvider(username=CASSANDRA_USERNAME, password=CASSANDRA_PASSWORD)\n",
"cluster = Cluster([CASSANDRA_HOST],\n",
"load_balancing_policy=TokenAwarePolicy(DCAwareRoundRobinPolicy()),\n",
"port=CASSANDRA_PORT,\n",
"auth_provider=auth_provider,\n",
"executor_threads=2,\n",
"protocol_version=4,\n",
") "
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"ipykernel_launcher:10: DeprecationWarning: Legacy execution parameters will be removed in 4.0. Consider using execution profiles.\n"
],
"name": "stderr"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "tz58KhpFzxsB"
},
"source": [
"session = cluster.connect()\n",
"register_connection(str(session), session=session)\n",
"set_default_connection(str(session))"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "m2aSYu5ZzxsC",
"outputId": "f603f5ce-e087-4f93-9209-aa4a27d43823"
},
"source": [
"rows = session.execute('select * from demo.click_stream;')\n",
"df = pd.DataFrame(list(rows))\n",
"df.head()"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" pk created_at client_ip \\\n",
"0 2021-05-01 2021-05-01 15:59:08.188 172.20.0.1 \n",
"1 2021-05-01 2021-05-01 15:59:08.256 172.20.0.1 \n",
"2 2021-05-01 2021-05-01 15:59:08.265 172.20.0.1 \n",
"3 2021-05-01 2021-05-01 15:59:08.270 172.20.0.1 \n",
"4 2021-05-01 2021-05-01 15:59:08.280 172.20.0.1 \n",
"\n",
" cookie_id event_name item_id \\\n",
"0 0:ko0sujvr:WcQ5Z8kIqvWRLp3~Rlpxu_ygeqqQMdkN view 10009 \n",
"1 0:ko0sujvr:WcQ5Z8kIqvWRLp3~Rlpxu_ygeqqQMdkN view 10004 \n",
"2 0:ko0sujvr:WcQ5Z8kIqvWRLp3~Rlpxu_ygeqqQMdkN view 10007 \n",
"3 0:ko0sujvr:WcQ5Z8kIqvWRLp3~Rlpxu_ygeqqQMdkN view 10002 \n",
"4 0:ko0sujvr:WcQ5Z8kIqvWRLp3~Rlpxu_ygeqqQMdkN view 10001 \n",
"\n",
" url \n",
"0 http://127.0.0.1:8051/ \n",
"1 http://127.0.0.1:8051/ \n",
"2 http://127.0.0.1:8051/ \n",
"3 http://127.0.0.1:8051/ \n",
"4 http://127.0.0.1:8051/ "
],
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" pk | \n",
" created_at | \n",
" client_ip | \n",
" cookie_id | \n",
" event_name | \n",
" item_id | \n",
" url | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 2021-05-01 | \n",
" 2021-05-01 15:59:08.188 | \n",
" 172.20.0.1 | \n",
" 0:ko0sujvr:WcQ5Z8kIqvWRLp3~Rlpxu_ygeqqQMdkN | \n",
" view | \n",
" 10009 | \n",
" http://127.0.0.1:8051/ | \n",
"
\n",
" \n",
" 1 | \n",
" 2021-05-01 | \n",
" 2021-05-01 15:59:08.256 | \n",
" 172.20.0.1 | \n",
" 0:ko0sujvr:WcQ5Z8kIqvWRLp3~Rlpxu_ygeqqQMdkN | \n",
" view | \n",
" 10004 | \n",
" http://127.0.0.1:8051/ | \n",
"
\n",
" \n",
" 2 | \n",
" 2021-05-01 | \n",
" 2021-05-01 15:59:08.265 | \n",
" 172.20.0.1 | \n",
" 0:ko0sujvr:WcQ5Z8kIqvWRLp3~Rlpxu_ygeqqQMdkN | \n",
" view | \n",
" 10007 | \n",
" http://127.0.0.1:8051/ | \n",
"
\n",
" \n",
" 3 | \n",
" 2021-05-01 | \n",
" 2021-05-01 15:59:08.270 | \n",
" 172.20.0.1 | \n",
" 0:ko0sujvr:WcQ5Z8kIqvWRLp3~Rlpxu_ygeqqQMdkN | \n",
" view | \n",
" 10002 | \n",
" http://127.0.0.1:8051/ | \n",
"
\n",
" \n",
" 4 | \n",
" 2021-05-01 | \n",
" 2021-05-01 15:59:08.280 | \n",
" 172.20.0.1 | \n",
" 0:ko0sujvr:WcQ5Z8kIqvWRLp3~Rlpxu_ygeqqQMdkN | \n",
" view | \n",
" 10001 | \n",
" http://127.0.0.1:8051/ | \n",
"
\n",
" \n",
"
\n",
"
"
]
},
"metadata": {
"tags": []
},
"execution_count": 6
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "0i4RM91czxsD",
"outputId": "fee58e18-97e9-4a4c-c72f-165fedd8431b"
},
"source": [
"df.info()"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 265 entries, 0 to 264\n",
"Data columns (total 7 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 pk 265 non-null object \n",
" 1 created_at 265 non-null datetime64[ns]\n",
" 2 client_ip 265 non-null object \n",
" 3 cookie_id 265 non-null object \n",
" 4 event_name 265 non-null object \n",
" 5 item_id 265 non-null object \n",
" 6 url 265 non-null object \n",
"dtypes: datetime64[ns](1), object(6)\n",
"memory usage: 14.6+ KB\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "2ZE4ekoEzxsE",
"outputId": "7c32c54d-deb2-48ad-b91d-8fe725d65f01"
},
"source": [
"df.describe()"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"ipykernel_launcher:1: FutureWarning: Treating datetime data as categorical rather than numeric in `.describe` is deprecated and will be removed in a future version of pandas. Specify `datetime_is_numeric=True` to silence this warning and adopt the future behavior now.\n"
],
"name": "stderr"
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
" pk created_at client_ip \\\n",
"count 265 265 265 \n",
"unique 1 265 1 \n",
"top 2021-05-01 2021-05-01 18:43:48.776000 172.20.0.1 \n",
"freq 265 1 265 \n",
"first NaN 2021-05-01 15:59:08.188000 NaN \n",
"last NaN 2021-05-01 18:44:17.576000 NaN \n",
"\n",
" cookie_id event_name item_id \\\n",
"count 265 265 265 \n",
"unique 5 4 12 \n",
"top 0:ko5xo7an:1ORfub~MP9WFwk9lasJHFV70F~lkpcVk view 10002 \n",
"freq 160 260 38 \n",
"first NaN NaN NaN \n",
"last NaN NaN NaN \n",
"\n",
" url \n",
"count 265 \n",
"unique 2 \n",
"top http://127.0.0.1:8051/ \n",
"freq 264 \n",
"first NaN \n",
"last NaN "
],
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" pk | \n",
" created_at | \n",
" client_ip | \n",
" cookie_id | \n",
" event_name | \n",
" item_id | \n",
" url | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
" 265 | \n",
" 265 | \n",
" 265 | \n",
" 265 | \n",
" 265 | \n",
" 265 | \n",
" 265 | \n",
"
\n",
" \n",
" unique | \n",
" 1 | \n",
" 265 | \n",
" 1 | \n",
" 5 | \n",
" 4 | \n",
" 12 | \n",
" 2 | \n",
"
\n",
" \n",
" top | \n",
" 2021-05-01 | \n",
" 2021-05-01 18:43:48.776000 | \n",
" 172.20.0.1 | \n",
" 0:ko5xo7an:1ORfub~MP9WFwk9lasJHFV70F~lkpcVk | \n",
" view | \n",
" 10002 | \n",
" http://127.0.0.1:8051/ | \n",
"
\n",
" \n",
" freq | \n",
" 265 | \n",
" 1 | \n",
" 265 | \n",
" 160 | \n",
" 260 | \n",
" 38 | \n",
" 264 | \n",
"
\n",
" \n",
" first | \n",
" NaN | \n",
" 2021-05-01 15:59:08.188000 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" last | \n",
" NaN | \n",
" 2021-05-01 18:44:17.576000 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
]
},
"metadata": {
"tags": []
},
"execution_count": 8
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "UVfC-exMzxsF",
"outputId": "854da3c4-6219-4074-8c15-29d21017fbad"
},
"source": [
"df.item_id.value_counts()"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"10002 38\n",
"10006 34\n",
"10003 30\n",
"10000 28\n",
"10009 27\n",
"10001 24\n",
"10004 24\n",
"10007 20\n",
"10005 20\n",
"10008 18\n",
"1204 1\n",
"1231 1\n",
"Name: item_id, dtype: int64"
]
},
"metadata": {
"tags": []
},
"execution_count": 9
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "YhMWrOQAzxsG"
},
"source": [
"df.to_pickle('../recommender/data/logs_test_020521_1.p')"
],
"execution_count": null,
"outputs": []
}
]
}