{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'3.6.7'" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from platform import python_version\n", "\n", "python_version()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "import matplotlib\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "('0.25.1', '3.0.3')" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.__version__,matplotlib.__version__" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlecontentsyear
0barSed mollis tempor accumsan.2010
1barSed mollis tempor accumsan.2010
2bazNullam et feugiat turpis, non condimentum dolor.2005
3bazAenean eu aliquam nunc.2005
4fooLorem ipsum dolor sit amet.2011
5fooLorem ipsum dolor sit amet.2011
\n", "
" ], "text/plain": [ " title contents year\n", "0 bar Sed mollis tempor accumsan. 2010\n", "1 bar Sed mollis tempor accumsan. 2010\n", "2 baz Nullam et feugiat turpis, non condimentum dolor. 2005\n", "3 baz Aenean eu aliquam nunc. 2005\n", "4 foo Lorem ipsum dolor sit amet. 2011\n", "5 foo Lorem ipsum dolor sit amet. 2011" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame({\n", " 'title': ['bar','bar','baz','baz','foo','foo'],\n", " 'contents':[\n", " 'Sed mollis tempor accumsan.',\n", " 'Sed mollis tempor accumsan.',\n", " 'Nullam et feugiat turpis, non condimentum dolor.',\n", " 'Aenean eu aliquam nunc.',\n", " 'Lorem ipsum dolor sit amet.',\n", " 'Lorem ipsum dolor sit amet.'\n", " ],\n", " 'year':[2010,2010,2005,2005,2011,2011]\n", "})\n", "\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## show duplicates" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlecontentsyear
1barSed mollis tempor accumsan.2010
5fooLorem ipsum dolor sit amet.2011
\n", "
" ], "text/plain": [ " title contents year\n", "1 bar Sed mollis tempor accumsan. 2010\n", "5 foo Lorem ipsum dolor sit amet. 2011" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df.duplicated()]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## show including original" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlecontentsyear
0barSed mollis tempor accumsan.2010
1barSed mollis tempor accumsan.2010
4fooLorem ipsum dolor sit amet.2011
5fooLorem ipsum dolor sit amet.2011
\n", "
" ], "text/plain": [ " title contents year\n", "0 bar Sed mollis tempor accumsan. 2010\n", "1 bar Sed mollis tempor accumsan. 2010\n", "4 foo Lorem ipsum dolor sit amet. 2011\n", "5 foo Lorem ipsum dolor sit amet. 2011" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df.duplicated(keep=False)]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## count duplicates" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(df[df.duplicated()])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## show duplicates based on some columns" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlecontentsyear
1barSed mollis tempor accumsan.2010
3bazAenean eu aliquam nunc.2005
5fooLorem ipsum dolor sit amet.2011
\n", "
" ], "text/plain": [ " title contents year\n", "1 bar Sed mollis tempor accumsan. 2010\n", "3 baz Aenean eu aliquam nunc. 2005\n", "5 foo Lorem ipsum dolor sit amet. 2011" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df.duplicated(subset=['title','year'])]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## drop duplicates, keep original" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlecontentsyear
0barSed mollis tempor accumsan.2010
2bazNullam et feugiat turpis, non condimentum dolor.2005
3bazAenean eu aliquam nunc.2005
4fooLorem ipsum dolor sit amet.2011
\n", "
" ], "text/plain": [ " title contents year\n", "0 bar Sed mollis tempor accumsan. 2010\n", "2 baz Nullam et feugiat turpis, non condimentum dolor. 2005\n", "3 baz Aenean eu aliquam nunc. 2005\n", "4 foo Lorem ipsum dolor sit amet. 2011" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.drop_duplicates()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## drop duplicates based on some columns" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlecontentsyear
0barSed mollis tempor accumsan.2010
2bazNullam et feugiat turpis, non condimentum dolor.2005
4fooLorem ipsum dolor sit amet.2011
\n", "
" ], "text/plain": [ " title contents year\n", "0 bar Sed mollis tempor accumsan. 2010\n", "2 baz Nullam et feugiat turpis, non condimentum dolor. 2005\n", "4 foo Lorem ipsum dolor sit amet. 2011" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.drop_duplicates(subset=['title','year'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## drop rows that are or have duplicates" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlecontentsyear
2bazNullam et feugiat turpis, non condimentum dolor.2005
3bazAenean eu aliquam nunc.2005
\n", "
" ], "text/plain": [ " title contents year\n", "2 baz Nullam et feugiat turpis, non condimentum dolor. 2005\n", "3 baz Aenean eu aliquam nunc. 2005" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.drop_duplicates(keep=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## mark duplicates" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlecontentsyearis_duplicate
0barSed mollis tempor accumsan.2010False
1barSed mollis tempor accumsan.2010True
2bazNullam et feugiat turpis, non condimentum dolor.2005False
3bazAenean eu aliquam nunc.2005False
4fooLorem ipsum dolor sit amet.2011False
5fooLorem ipsum dolor sit amet.2011True
\n", "
" ], "text/plain": [ " title contents year is_duplicate\n", "0 bar Sed mollis tempor accumsan. 2010 False\n", "1 bar Sed mollis tempor accumsan. 2010 True\n", "2 baz Nullam et feugiat turpis, non condimentum dolor. 2005 False\n", "3 baz Aenean eu aliquam nunc. 2005 False\n", "4 foo Lorem ipsum dolor sit amet. 2011 False\n", "5 foo Lorem ipsum dolor sit amet. 2011 True" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.assign(\n", " is_duplicate=lambda d: d.duplicated()\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## custom keep logic: keep the latest year per title and contents" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlecontentsyear
0barSed mollis tempor accumsan.2009
1barSed mollis tempor accumsan.2019
2bazNullam et feugiat turpis, non condimentum dolor.2005
3bazAenean eu aliquam nunc.2005
4fooLorem ipsum dolor sit amet.2015
5fooLorem ipsum dolor sit amet.1995
\n", "
" ], "text/plain": [ " title contents year\n", "0 bar Sed mollis tempor accumsan. 2009\n", "1 bar Sed mollis tempor accumsan. 2019\n", "2 baz Nullam et feugiat turpis, non condimentum dolor. 2005\n", "3 baz Aenean eu aliquam nunc. 2005\n", "4 foo Lorem ipsum dolor sit amet. 2015\n", "5 foo Lorem ipsum dolor sit amet. 1995" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame({\n", " 'title': ['bar','bar','baz','baz','foo','foo'],\n", " 'contents':[\n", " 'Sed mollis tempor accumsan.',\n", " 'Sed mollis tempor accumsan.',\n", " 'Nullam et feugiat turpis, non condimentum dolor.',\n", " 'Aenean eu aliquam nunc.',\n", " 'Lorem ipsum dolor sit amet.',\n", " 'Lorem ipsum dolor sit amet.'\n", " ],\n", " 'year':[2009,2019,2005,2005,2015,1995]\n", "})\n", "\n", "df" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlecontentsyear
1barSed mollis tempor accumsan.2019
3bazAenean eu aliquam nunc.2005
2bazNullam et feugiat turpis, non condimentum dolor.2005
4fooLorem ipsum dolor sit amet.2015
\n", "
" ], "text/plain": [ " title contents year\n", "1 bar Sed mollis tempor accumsan. 2019\n", "3 baz Aenean eu aliquam nunc. 2005\n", "2 baz Nullam et feugiat turpis, non condimentum dolor. 2005\n", "4 foo Lorem ipsum dolor sit amet. 2015" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.sort_values(\n", " ['title','contents','year']\n", ").drop_duplicates(\n", " subset=['title','contents'],keep='last'\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3.6", "language": "python", "name": "python36" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.7" } }, "nbformat": 4, "nbformat_minor": 2 }