{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### Process the Wine Review file from [Kaggle](https://www.kaggle.com/zynicide/wine-reviews)\n", "\n", "* Do some very high level exploration of the data\n", "* Filter data down to only include wines from Australia \n", "* Other minor changes to support plotting \n", "\n", "This notebook is a companion to the post at [Practical Business Python](http://pbpython.com/wine_visualization.html)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from bokeh.palettes import viridis" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Download and unpack the file from https://www.kaggle.com/zynicide/wine-reviews/data\n", "df = pd.read_csv('winemag-data-130k-v2.csv', index_col=0)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countrydescriptiondesignationpointspriceprovinceregion_1region_2taster_nametaster_twitter_handletitlevarietywinery
0ItalyAromas include tropical fruit, broom, brimston...Vulkà Bianco87NaNSicily & SardiniaEtnaNaNKerin O’Keefe@kerinokeefeNicosia 2013 Vulkà Bianco (Etna)White BlendNicosia
1PortugalThis is ripe and fruity, a wine that is smooth...Avidagos8715.0DouroNaNNaNRoger Voss@vossrogerQuinta dos Avidagos 2011 Avidagos Red (Douro)Portuguese RedQuinta dos Avidagos
2USTart and snappy, the flavors of lime flesh and...NaN8714.0OregonWillamette ValleyWillamette ValleyPaul Gregutt@paulgwineRainstorm 2013 Pinot Gris (Willamette Valley)Pinot GrisRainstorm
3USPineapple rind, lemon pith and orange blossom ...Reserve Late Harvest8713.0MichiganLake Michigan ShoreNaNAlexander PeartreeNaNSt. Julian 2013 Reserve Late Harvest Riesling ...RieslingSt. Julian
4USMuch like the regular bottling from 2012, this...Vintner's Reserve Wild Child Block8765.0OregonWillamette ValleyWillamette ValleyPaul Gregutt@paulgwineSweet Cheeks 2012 Vintner's Reserve Wild Child...Pinot NoirSweet Cheeks
\n", "
" ], "text/plain": [ " country description \\\n", "0 Italy Aromas include tropical fruit, broom, brimston... \n", "1 Portugal This is ripe and fruity, a wine that is smooth... \n", "2 US Tart and snappy, the flavors of lime flesh and... \n", "3 US Pineapple rind, lemon pith and orange blossom ... \n", "4 US Much like the regular bottling from 2012, this... \n", "\n", " designation points price province \\\n", "0 Vulkà Bianco 87 NaN Sicily & Sardinia \n", "1 Avidagos 87 15.0 Douro \n", "2 NaN 87 14.0 Oregon \n", "3 Reserve Late Harvest 87 13.0 Michigan \n", "4 Vintner's Reserve Wild Child Block 87 65.0 Oregon \n", "\n", " region_1 region_2 taster_name \\\n", "0 Etna NaN Kerin O’Keefe \n", "1 NaN NaN Roger Voss \n", "2 Willamette Valley Willamette Valley Paul Gregutt \n", "3 Lake Michigan Shore NaN Alexander Peartree \n", "4 Willamette Valley Willamette Valley Paul Gregutt \n", "\n", " taster_twitter_handle title \\\n", "0 @kerinokeefe Nicosia 2013 Vulkà Bianco (Etna) \n", "1 @vossroger Quinta dos Avidagos 2011 Avidagos Red (Douro) \n", "2 @paulgwine  Rainstorm 2013 Pinot Gris (Willamette Valley) \n", "3 NaN St. Julian 2013 Reserve Late Harvest Riesling ... \n", "4 @paulgwine  Sweet Cheeks 2012 Vintner's Reserve Wild Child... \n", "\n", " variety winery \n", "0 White Blend Nicosia \n", "1 Portuguese Red Quinta dos Avidagos \n", "2 Pinot Gris Rainstorm \n", "3 Riesling St. Julian \n", "4 Pinot Noir Sweet Cheeks " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "US 54504\n", "France 22093\n", "Italy 19540\n", "Spain 6645\n", "Portugal 5691\n", "Chile 4472\n", "Argentina 3800\n", "Austria 3345\n", "Australia 2329\n", "Germany 2165\n", "New Zealand 1419\n", "South Africa 1401\n", "Israel 505\n", "Greece 466\n", "Canada 257\n", "Name: country, dtype: int64" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Let's see the top 15 Countries\n", "df.country.value_counts()[:15]" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Parse out just the Australian wines\n", "df_Aussie = df[df.country == \"Australia\"].copy()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pointsprice
count2329.0000002294.000000
mean88.58050735.437663
std2.98990049.049458
min80.0000005.000000
25%87.00000015.000000
50%89.00000021.000000
75%91.00000038.000000
max100.000000850.000000
\n", "
" ], "text/plain": [ " points price\n", "count 2329.000000 2294.000000\n", "mean 88.580507 35.437663\n", "std 2.989900 49.049458\n", "min 80.000000 5.000000\n", "25% 87.000000 15.000000\n", "50% 89.000000 21.000000\n", "75% 91.000000 38.000000\n", "max 100.000000 850.000000" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Let's see what the price and point distribution looks like\n", "df_Aussie.describe()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "country False\n", "description False\n", "designation True\n", "points False\n", "price True\n", "province False\n", "region_1 True\n", "region_2 True\n", "taster_name True\n", "taster_twitter_handle True\n", "title False\n", "variety False\n", "winery False\n", "dtype: bool" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Let's see where null values might be\n", "df_Aussie.isnull().any()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "For this analysis, price and points are going to be most important. Let's see how many null prices there are" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "35" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(df_Aussie[df_Aussie.price.isnull()])" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# We can drop these or fill with the average. For the sake of this analysis, I'll fill with the mean\n", "df_Aussie['price'].fillna((df_Aussie['price'].mean()), inplace=True)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Double check\n", "len(df_Aussie[df_Aussie.price.isnull()])" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# There are a couple of titles that are duplicated and it causes minor problems when plotting. \n", "df_Aussie.drop_duplicates(subset=['title'], inplace=True)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "South Australia 1268\n", "Victoria 293\n", "Western Australia 274\n", "Australia Other 230\n", "New South Wales 82\n", "Tasmania 36\n", "Name: province, dtype: int64" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# One way we will filter the data is by province. Let's see how many there are\n", "df_Aussie.province.value_counts()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Shiraz 583\n", "Chardonnay 378\n", "Cabernet Sauvignon 255\n", "Riesling 142\n", "Pinot Noir 119\n", "Grenache 72\n", "Red Blend 58\n", "Sauvignon Blanc 50\n", "Shiraz-Viognier 38\n", "Viognier 32\n", "Sparkling Blend 31\n", "Shiraz-Cabernet Sauvignon 29\n", "Bordeaux-style Red Blend 28\n", "Rosé 25\n", "Pinot Grigio 25\n", "Name: variety, dtype: int64" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# It will be nice to color code the plot by the different varieties\n", "df_Aussie.variety.value_counts()[:15]" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "# Use the viridis pallette and create a column that contains the desired color\n", "varieties = list(df_Aussie.variety.unique())\n", "colors = viridis(len(varieties))\n", "color_map = dict(zip(varieties, colors))\n", "df_Aussie[\"variety_color\"] = df_Aussie[\"variety\"].map(color_map)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countrydescriptiondesignationpointspriceprovinceregion_1region_2taster_nametaster_twitter_handletitlevarietywineryvariety_color
77AustraliaThis medium-bodied Chardonnay features aromas ...Made With Organic Grapes8618.0South AustraliaSouth AustraliaNaNJoe Czerwinski@JoeCzYalumba 2016 Made With Organic Grapes Chardonn...ChardonnayYalumba#440154
83AustraliaPale copper in hue, this wine exudes passion f...Jester Sangiovese8620.0South AustraliaMcLaren ValeNaNJoe Czerwinski@JoeCzMitolo 2016 Jester Sangiovese Rosé (McLaren Vale)RoséMitolo#450558
123AustraliaThe blend is roughly two-thirds Shiraz and one...Parson's Flat9240.0South AustraliaPadthawayNaNJoe Czerwinski@JoeCzHenry's Drive Vignerons 2006 Parson's Flat Shi...Shiraz-Cabernet SauvignonHenry's Drive Vignerons#460B5E
191AustraliaFrom the little-known region of Padthaway, thi...The Trial of John Montford8730.0South AustraliaPadthawayNaNJoe Czerwinski@JoeCzHenry's Drive Vignerons 2006 The Trial of John...Cabernet SauvignonHenry's Drive Vignerons#471163
232AustraliaLifted cedar and pine notes interspersed with ...Red Belly Black8512.0South AustraliaSouth AustraliaNaNNaNNaNAngove's 2006 Red Belly Black Shiraz (South Au...ShirazAngove's#471669
\n", "
" ], "text/plain": [ " country description \\\n", "77 Australia This medium-bodied Chardonnay features aromas ... \n", "83 Australia Pale copper in hue, this wine exudes passion f... \n", "123 Australia The blend is roughly two-thirds Shiraz and one... \n", "191 Australia From the little-known region of Padthaway, thi... \n", "232 Australia Lifted cedar and pine notes interspersed with ... \n", "\n", " designation points price province \\\n", "77 Made With Organic Grapes 86 18.0 South Australia \n", "83 Jester Sangiovese 86 20.0 South Australia \n", "123 Parson's Flat 92 40.0 South Australia \n", "191 The Trial of John Montford 87 30.0 South Australia \n", "232 Red Belly Black 85 12.0 South Australia \n", "\n", " region_1 region_2 taster_name taster_twitter_handle \\\n", "77 South Australia NaN Joe Czerwinski @JoeCz \n", "83 McLaren Vale NaN Joe Czerwinski @JoeCz \n", "123 Padthaway NaN Joe Czerwinski @JoeCz \n", "191 Padthaway NaN Joe Czerwinski @JoeCz \n", "232 South Australia NaN NaN NaN \n", "\n", " title \\\n", "77 Yalumba 2016 Made With Organic Grapes Chardonn... \n", "83 Mitolo 2016 Jester Sangiovese Rosé (McLaren Vale) \n", "123 Henry's Drive Vignerons 2006 Parson's Flat Shi... \n", "191 Henry's Drive Vignerons 2006 The Trial of John... \n", "232 Angove's 2006 Red Belly Black Shiraz (South Au... \n", "\n", " variety winery variety_color \n", "77 Chardonnay Yalumba #440154 \n", "83 Rosé Mitolo #450558 \n", "123 Shiraz-Cabernet Sauvignon Henry's Drive Vignerons #460B5E \n", "191 Cabernet Sauvignon Henry's Drive Vignerons #471163 \n", "232 Shiraz Angove's #471669 " ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_Aussie.head()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "# Save the filtered and processed file to be used in our bokeh plot\n", "df_Aussie.to_csv(\"Aussie_Wines_Plotting.csv\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python [default]", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.3" } }, "nbformat": 4, "nbformat_minor": 2 }