{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Немного про кино или интерактивные графики в jupyter notebooks" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1.12.6\n" ] }, { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from plotly import __version__\n", "from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot\n", "import plotly.graph_objs as go\n", "\n", "import requests\n", "import StringIO\n", "import re\n", "import pandas as pd\n", "import numpy as np\n", "from collections import defaultdict\n", "\n", "print __version__\n", "\n", "init_notebook_mode(connected=True)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df = pd.read_csv('kp_all_movies.csv')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movie_idname_ruskp_ratingmovie_durationkp_rating_countmovie_yearimdb_ratingimdb_rating_countgenrescountriesbudgetcritics_ratingname_eng
010000Иезавель7.411104518.019387.68585.0[драма, мелодрама][США]$1 250 00094.0Jezebel
110004947 ронинов7.660204297.019627.81496.0[боевик, драма, история][Япония]NaNNaNChûshingura
210005Живем один раз7.16886330.019377.43812.0[фильм-нуар, драма, криминал][США]$575 000100.0You Only Live Once
3100053Колдун 27.745113109.020036.5393.0[фэнтези, боевик][Япония]NaNNaNOnmyoji 2
4100096Ильза, тигрица из Сибири4.28685151.019774.91211.0[ужасы][Канада]CAD 250 000NaNIlsa the Tigress of Siberia
\n", "
" ], "text/plain": [ " movie_id name_rus kp_rating movie_duration \\\n", "0 10000 Иезавель 7.411 104 \n", "1 100049 47 ронинов 7.660 204 \n", "2 10005 Живем один раз 7.168 86 \n", "3 100053 Колдун 2 7.745 113 \n", "4 100096 Ильза, тигрица из Сибири 4.286 85 \n", "\n", " kp_rating_count movie_year imdb_rating imdb_rating_count \\\n", "0 518.0 1938 7.6 8585.0 \n", "1 297.0 1962 7.8 1496.0 \n", "2 330.0 1937 7.4 3812.0 \n", "3 109.0 2003 6.5 393.0 \n", "4 151.0 1977 4.9 1211.0 \n", "\n", " genres countries budget critics_rating \\\n", "0 [драма, мелодрама] [США] $1 250 000 94.0 \n", "1 [боевик, драма, история] [Япония] NaN NaN \n", "2 [фильм-нуар, драма, криминал] [США] $575 000 100.0 \n", "3 [фэнтези, боевик] [Япония] NaN NaN \n", "4 [ужасы] [Канада] CAD 250 000 NaN \n", "\n", " name_eng \n", "0 Jezebel \n", "1 Chûshingura \n", "2 You Only Live Once \n", "3 Onmyoji 2 \n", "4 Ilsa the Tigress of Siberia " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Количество фильмов в зависимости от года выпуска" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true }, "outputs": [], "source": [ "count_year_df = df.groupby('movie_year', as_index = False).movie_id.count()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "trace = go.Bar(\n", " x = count_year_df.movie_year,\n", " y = count_year_df.movie_id\n", ")\n", "layout = go.Layout(\n", " title='Фильмы на Кинопоиске',\n", ")\n", "\n", "fig = go.Figure(data = [trace], layout = layout)\n", "iplot(fig)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Как менялись оценки в зависимости от года выпуска" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": true }, "outputs": [], "source": [ "rating_year_df = df.groupby('movie_year', as_index = False)[['kp_rating', 'imdb_rating']].mean()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "trace_kp = go.Scatter(\n", " x = rating_year_df.movie_year,\n", " y = rating_year_df.kp_rating,\n", " mode = 'lines',\n", " name = u'КиноПоиск'\n", ")\n", "trace_imdb = go.Scatter(\n", " x = rating_year_df.movie_year,\n", " y = rating_year_df.imdb_rating,\n", " mode = 'lines',\n", " name = 'IMDb'\n", ")\n", "\n", "layout = go.Layout(\n", " title='Оценки фильмов',\n", ") \n", " \n", "fig = go.Figure(data = [trace_kp, trace_imdb], layout = layout)\n", "iplot(fig)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Есть ли различия в оценках в зависимости от жанра фильма?\n", "В первую очередь нужно распарсить поле `genres` в dataframe." ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df['genres'] = df['genres'].fillna('[]')" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def parse_list(lst_str):\n", " return filter(lambda y: y != '', \n", " map(lambda x: x.strip(), \n", " re.sub(r'[\\[\\]]', '', lst_str).split(',')))" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false }, "outputs": [], "source": [ "genres_data = []\n", "for record in df.to_dict(orient = 'records'):\n", " genres_lst = parse_list(record['genres'])\n", " for genre in genres_lst:\n", " copy = record.copy()\n", " copy['genre'] = genre\n", " copy['weight'] = 1./len(genres_lst)\n", " genres_data.append(copy)\n", " \n", "genres_df = pd.DataFrame.from_dict(genres_data)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
budgetcountriescritics_ratinggenregenresimdb_ratingimdb_rating_countkp_ratingkp_rating_countmovie_durationmovie_idmovie_yearname_engname_rusweight
0$1 250 000[США]94.0драма[драма, мелодрама]7.68585.07.411518.0104100001938JezebelИезавель0.500000
1$1 250 000[США]94.0мелодрама[драма, мелодрама]7.68585.07.411518.0104100001938JezebelИезавель0.500000
2NaN[Япония]NaNбоевик[боевик, драма, история]7.81496.07.660297.02041000491962Chûshingura47 ронинов0.333333
3NaN[Япония]NaNдрама[боевик, драма, история]7.81496.07.660297.02041000491962Chûshingura47 ронинов0.333333
4NaN[Япония]NaNистория[боевик, драма, история]7.81496.07.660297.02041000491962Chûshingura47 ронинов0.333333
\n", "
" ], "text/plain": [ " budget countries critics_rating genre genres \\\n", "0 $1 250 000 [США] 94.0 драма [драма, мелодрама] \n", "1 $1 250 000 [США] 94.0 мелодрама [драма, мелодрама] \n", "2 NaN [Япония] NaN боевик [боевик, драма, история] \n", "3 NaN [Япония] NaN драма [боевик, драма, история] \n", "4 NaN [Япония] NaN история [боевик, драма, история] \n", "\n", " imdb_rating imdb_rating_count kp_rating kp_rating_count movie_duration \\\n", "0 7.6 8585.0 7.411 518.0 104 \n", "1 7.6 8585.0 7.411 518.0 104 \n", "2 7.8 1496.0 7.660 297.0 204 \n", "3 7.8 1496.0 7.660 297.0 204 \n", "4 7.8 1496.0 7.660 297.0 204 \n", "\n", " movie_id movie_year name_eng name_rus weight \n", "0 10000 1938 Jezebel Иезавель 0.500000 \n", "1 10000 1938 Jezebel Иезавель 0.500000 \n", "2 100049 1962 Chûshingura 47 ронинов 0.333333 \n", "3 100049 1962 Chûshingura 47 ронинов 0.333333 \n", "4 100049 1962 Chûshingura 47 ронинов 0.333333 " ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "genres_df.head()" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# сформируем топ-10 жанров по количеству фильмов\n", "top_genres = genres_df.groupby('genre')[['movie_id']].count()\\\n", " .sort_values('movie_id', ascending = False)\\\n", " .head(10).index.values.tolist()" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "N = float(len(top_genres))\n", "\n", "# cгенерируем цвета для визуализации\n", "c = ['hsl('+str(h)+',50%'+',50%)' for h in np.linspace(0, 360, N)]\n", "\n", "data = [{\n", " 'y': genres_df[genres_df.genre == top_genres[i]].kp_rating, \n", " 'type':'box',\n", " 'marker':{'color': c[i]},\n", " 'name': top_genres[i]\n", " } for i in range(len(top_genres))]\n", "\n", "layout = go.Layout(\n", " title='Оценки фильмов',\n", " yaxis = {'title': 'Оценка КиноПоиска'}\n", ") \n", "\n", "fig = go.Figure(data = data, layout = layout)\n", "iplot(fig)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Какие жанры чаще всего присутствуют вместе?" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": false }, "outputs": [], "source": [ "genres_coincidents = {}\n", "\n", "for item in df.genres:\n", " parsed_genres = parse_list(item)\n", " for genre1 in parsed_genres:\n", " if genre1 not in genres_coincidents:\n", " genres_coincidents[genre1] = defaultdict(int)\n", " for genre2 in parsed_genres:\n", " genres_coincidents[genre1][genre2] += 1\n", "\n", "genres_coincidents_df = pd.DataFrame.from_dict(genres_coincidents).fillna(0)\n", "\n", "# отнормируем таблицу на количество фильмов каждого жанра\n", "genres_coincidents_df_norm = genres_coincidents_df\\\n", " .apply(lambda x: x/genres_df.groupby('genre').movie_id.count(), axis = 1)" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "heatmap = go.Heatmap(\n", " z = genres_coincidents_df_norm.values,\n", " x = genres_coincidents_df_norm.index.values,\n", " y = genres_coincidents_df_norm.columns\n", ")\n", "layout = go.Layout(\n", " title = 'Связанные жанры'\n", ")\n", "\n", "fig = go.Figure(data = [heatmap], layout = layout)\n", "iplot(fig)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Как менялись оценки в зависимости от года выпуска в различных жанрах" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "collapsed": false }, "outputs": [], "source": [ "genre_rating_year_df = genres_df.groupby(['movie_year', 'genre'], as_index = False)[['kp_rating', 'imdb_rating']].mean()" ] }, { "cell_type": "code", "execution_count": 48, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "N = len(top_genres)\n", "\n", "data = []\n", "drop_menus = []\n", "\n", "# конструируем все интересующие нас линии\n", "for i in range(N):\n", " genre = top_genres[i]\n", " genre_df = genre_rating_year_df[genre_rating_year_df.genre == genre]\n", " \n", " trace_kp = go.Scatter(\n", " x = genre_df.movie_year,\n", " y = genre_df.kp_rating,\n", " mode = 'lines',\n", " name = genre + ' КиноПоиск',\n", " visible = (i == 0)\n", " )\n", " trace_imdb = go.Scatter(\n", " x = genre_df.movie_year,\n", " y = genre_df.imdb_rating,\n", " mode = 'lines',\n", " name = genre + ' IMDb',\n", " visible = (i == 0)\n", " )\n", " data.append(trace_kp)\n", " data.append(trace_imdb)\n", " \n", "# создаем выпадающие меню\n", "for i in range(N):\n", " drop_menus.append(\n", " dict(\n", " args=['visible', [False]*2*i + [True]*2 + [False]*2*(N-1-i)],\n", " label= top_genres[i],\n", " method='restyle'\n", " )\n", " )\n", "\n", "layout = go.Layout(\n", " title='Фильмы по жанрам',\n", " updatemenus=list([\n", " dict(\n", " x = -0.1,\n", " y = 1,\n", " yanchor = 'top',\n", " buttons = drop_menus\n", " )\n", " ]),\n", ")\n", "\n", "fig = go.Figure(data = data, layout = layout)\n", "iplot(fig)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.11" } }, "nbformat": 4, "nbformat_minor": 0 }