{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'3.6.7'"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from platform import python_version\n",
"\n",
"python_version()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"import matplotlib\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"('0.25.1', '3.0.3')"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.__version__,matplotlib.__version__"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" title | \n",
" contents | \n",
" year | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" bar | \n",
" Sed mollis tempor accumsan. | \n",
" 2010 | \n",
"
\n",
" \n",
" 1 | \n",
" bar | \n",
" Sed mollis tempor accumsan. | \n",
" 2010 | \n",
"
\n",
" \n",
" 2 | \n",
" baz | \n",
" Nullam et feugiat turpis, non condimentum dolor. | \n",
" 2005 | \n",
"
\n",
" \n",
" 3 | \n",
" baz | \n",
" Aenean eu aliquam nunc. | \n",
" 2005 | \n",
"
\n",
" \n",
" 4 | \n",
" foo | \n",
" Lorem ipsum dolor sit amet. | \n",
" 2011 | \n",
"
\n",
" \n",
" 5 | \n",
" foo | \n",
" Lorem ipsum dolor sit amet. | \n",
" 2011 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" title contents year\n",
"0 bar Sed mollis tempor accumsan. 2010\n",
"1 bar Sed mollis tempor accumsan. 2010\n",
"2 baz Nullam et feugiat turpis, non condimentum dolor. 2005\n",
"3 baz Aenean eu aliquam nunc. 2005\n",
"4 foo Lorem ipsum dolor sit amet. 2011\n",
"5 foo Lorem ipsum dolor sit amet. 2011"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame({\n",
" 'title': ['bar','bar','baz','baz','foo','foo'],\n",
" 'contents':[\n",
" 'Sed mollis tempor accumsan.',\n",
" 'Sed mollis tempor accumsan.',\n",
" 'Nullam et feugiat turpis, non condimentum dolor.',\n",
" 'Aenean eu aliquam nunc.',\n",
" 'Lorem ipsum dolor sit amet.',\n",
" 'Lorem ipsum dolor sit amet.'\n",
" ],\n",
" 'year':[2010,2010,2005,2005,2011,2011]\n",
"})\n",
"\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## show duplicate rows (excluding the first occurrence)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" title | \n",
" contents | \n",
" year | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" bar | \n",
" Sed mollis tempor accumsan. | \n",
" 2010 | \n",
"
\n",
" \n",
" 5 | \n",
" foo | \n",
" Lorem ipsum dolor sit amet. | \n",
" 2011 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" title contents year\n",
"1 bar Sed mollis tempor accumsan. 2010\n",
"5 foo Lorem ipsum dolor sit amet. 2011"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df.duplicated()]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## show duplicate rows, including the first occurrence"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" title | \n",
" contents | \n",
" year | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" bar | \n",
" Sed mollis tempor accumsan. | \n",
" 2010 | \n",
"
\n",
" \n",
" 1 | \n",
" bar | \n",
" Sed mollis tempor accumsan. | \n",
" 2010 | \n",
"
\n",
" \n",
" 4 | \n",
" foo | \n",
" Lorem ipsum dolor sit amet. | \n",
" 2011 | \n",
"
\n",
" \n",
" 5 | \n",
" foo | \n",
" Lorem ipsum dolor sit amet. | \n",
" 2011 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" title contents year\n",
"0 bar Sed mollis tempor accumsan. 2010\n",
"1 bar Sed mollis tempor accumsan. 2010\n",
"4 foo Lorem ipsum dolor sit amet. 2011\n",
"5 foo Lorem ipsum dolor sit amet. 2011"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df.duplicated(keep=False)]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## count duplicate rows"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(df[df.duplicated()])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## show duplicates based on some columns only"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" title | \n",
" contents | \n",
" year | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" bar | \n",
" Sed mollis tempor accumsan. | \n",
" 2010 | \n",
"
\n",
" \n",
" 3 | \n",
" baz | \n",
" Aenean eu aliquam nunc. | \n",
" 2005 | \n",
"
\n",
" \n",
" 5 | \n",
" foo | \n",
" Lorem ipsum dolor sit amet. | \n",
" 2011 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" title contents year\n",
"1 bar Sed mollis tempor accumsan. 2010\n",
"3 baz Aenean eu aliquam nunc. 2005\n",
"5 foo Lorem ipsum dolor sit amet. 2011"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df.duplicated(subset=['title','year'])]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## drop duplicates, keep original"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" title | \n",
" contents | \n",
" year | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" bar | \n",
" Sed mollis tempor accumsan. | \n",
" 2010 | \n",
"
\n",
" \n",
" 2 | \n",
" baz | \n",
" Nullam et feugiat turpis, non condimentum dolor. | \n",
" 2005 | \n",
"
\n",
" \n",
" 3 | \n",
" baz | \n",
" Aenean eu aliquam nunc. | \n",
" 2005 | \n",
"
\n",
" \n",
" 4 | \n",
" foo | \n",
" Lorem ipsum dolor sit amet. | \n",
" 2011 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" title contents year\n",
"0 bar Sed mollis tempor accumsan. 2010\n",
"2 baz Nullam et feugiat turpis, non condimentum dolor. 2005\n",
"3 baz Aenean eu aliquam nunc. 2005\n",
"4 foo Lorem ipsum dolor sit amet. 2011"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.drop_duplicates()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## drop duplicates based on some columns"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" title | \n",
" contents | \n",
" year | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" bar | \n",
" Sed mollis tempor accumsan. | \n",
" 2010 | \n",
"
\n",
" \n",
" 2 | \n",
" baz | \n",
" Nullam et feugiat turpis, non condimentum dolor. | \n",
" 2005 | \n",
"
\n",
" \n",
" 4 | \n",
" foo | \n",
" Lorem ipsum dolor sit amet. | \n",
" 2011 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" title contents year\n",
"0 bar Sed mollis tempor accumsan. 2010\n",
"2 baz Nullam et feugiat turpis, non condimentum dolor. 2005\n",
"4 foo Lorem ipsum dolor sit amet. 2011"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.drop_duplicates(subset=['title','year'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## drop all rows that have duplicates (keep none of them)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" title | \n",
" contents | \n",
" year | \n",
"
\n",
" \n",
" \n",
" \n",
" 2 | \n",
" baz | \n",
" Nullam et feugiat turpis, non condimentum dolor. | \n",
" 2005 | \n",
"
\n",
" \n",
" 3 | \n",
" baz | \n",
" Aenean eu aliquam nunc. | \n",
" 2005 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" title contents year\n",
"2 baz Nullam et feugiat turpis, non condimentum dolor. 2005\n",
"3 baz Aenean eu aliquam nunc. 2005"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.drop_duplicates(keep=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## mark duplicates"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" title | \n",
" contents | \n",
" year | \n",
" is_duplicate | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" bar | \n",
" Sed mollis tempor accumsan. | \n",
" 2010 | \n",
" False | \n",
"
\n",
" \n",
" 1 | \n",
" bar | \n",
" Sed mollis tempor accumsan. | \n",
" 2010 | \n",
" True | \n",
"
\n",
" \n",
" 2 | \n",
" baz | \n",
" Nullam et feugiat turpis, non condimentum dolor. | \n",
" 2005 | \n",
" False | \n",
"
\n",
" \n",
" 3 | \n",
" baz | \n",
" Aenean eu aliquam nunc. | \n",
" 2005 | \n",
" False | \n",
"
\n",
" \n",
" 4 | \n",
" foo | \n",
" Lorem ipsum dolor sit amet. | \n",
" 2011 | \n",
" False | \n",
"
\n",
" \n",
" 5 | \n",
" foo | \n",
" Lorem ipsum dolor sit amet. | \n",
" 2011 | \n",
" True | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" title contents year is_duplicate\n",
"0 bar Sed mollis tempor accumsan. 2010 False\n",
"1 bar Sed mollis tempor accumsan. 2010 True\n",
"2 baz Nullam et feugiat turpis, non condimentum dolor. 2005 False\n",
"3 baz Aenean eu aliquam nunc. 2005 False\n",
"4 foo Lorem ipsum dolor sit amet. 2011 False\n",
"5 foo Lorem ipsum dolor sit amet. 2011 True"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.assign(\n",
" is_duplicate=lambda d: d.duplicated()\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## custom keep logic: keep the row with the latest year per title/contents"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" title | \n",
" contents | \n",
" year | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" bar | \n",
" Sed mollis tempor accumsan. | \n",
" 2009 | \n",
"
\n",
" \n",
" 1 | \n",
" bar | \n",
" Sed mollis tempor accumsan. | \n",
" 2019 | \n",
"
\n",
" \n",
" 2 | \n",
" baz | \n",
" Nullam et feugiat turpis, non condimentum dolor. | \n",
" 2005 | \n",
"
\n",
" \n",
" 3 | \n",
" baz | \n",
" Aenean eu aliquam nunc. | \n",
" 2005 | \n",
"
\n",
" \n",
" 4 | \n",
" foo | \n",
" Lorem ipsum dolor sit amet. | \n",
" 2015 | \n",
"
\n",
" \n",
" 5 | \n",
" foo | \n",
" Lorem ipsum dolor sit amet. | \n",
" 1995 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" title contents year\n",
"0 bar Sed mollis tempor accumsan. 2009\n",
"1 bar Sed mollis tempor accumsan. 2019\n",
"2 baz Nullam et feugiat turpis, non condimentum dolor. 2005\n",
"3 baz Aenean eu aliquam nunc. 2005\n",
"4 foo Lorem ipsum dolor sit amet. 2015\n",
"5 foo Lorem ipsum dolor sit amet. 1995"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame({\n",
" 'title': ['bar','bar','baz','baz','foo','foo'],\n",
" 'contents':[\n",
" 'Sed mollis tempor accumsan.',\n",
" 'Sed mollis tempor accumsan.',\n",
" 'Nullam et feugiat turpis, non condimentum dolor.',\n",
" 'Aenean eu aliquam nunc.',\n",
" 'Lorem ipsum dolor sit amet.',\n",
" 'Lorem ipsum dolor sit amet.'\n",
" ],\n",
" 'year':[2009,2019,2005,2005,2015,1995]\n",
"})\n",
"\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" title | \n",
" contents | \n",
" year | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" bar | \n",
" Sed mollis tempor accumsan. | \n",
" 2019 | \n",
"
\n",
" \n",
" 3 | \n",
" baz | \n",
" Aenean eu aliquam nunc. | \n",
" 2005 | \n",
"
\n",
" \n",
" 2 | \n",
" baz | \n",
" Nullam et feugiat turpis, non condimentum dolor. | \n",
" 2005 | \n",
"
\n",
" \n",
" 4 | \n",
" foo | \n",
" Lorem ipsum dolor sit amet. | \n",
" 2015 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" title contents year\n",
"1 bar Sed mollis tempor accumsan. 2019\n",
"3 baz Aenean eu aliquam nunc. 2005\n",
"2 baz Nullam et feugiat turpis, non condimentum dolor. 2005\n",
"4 foo Lorem ipsum dolor sit amet. 2015"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.sort_values(\n",
" ['title','contents','year']\n",
").drop_duplicates(\n",
" subset=['title','contents'],keep='last'\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.6",
"language": "python",
"name": "python36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}