{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## RusAge\n",
    "Data from RusAge (https://www.kaggle.com/oldaandozerskaya/fiction-corpus-for-agebased-text-classification)\n",
    "\n",
    "RusAge: Corpus for Age-Based Text Classification\n",
    "Russian fiction books' previews with age rating labels."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Configuration\n",
    "\n",
    "Every value a reader may want to tune is collected here, so the analysis cells below contain no magic numbers."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Path of the metadata CSV, relative to the notebook's working directory.\n",
    "DATA_PATH = 'RusAge.csv'\n",
    "\n",
    "# Column labels applied when the CSV is read (described in the next section).\n",
    "COLUMN_NAMES = [\"filename\", \"book_title\", \"author\", \"age_rating\", \"genres\"]\n",
    "\n",
    "# Number of leading rows shown when browsing the table.\n",
    "PREVIEW_ROWS = 50\n",
    "\n",
    "# An author must have strictly more than this many entries to appear in the\n",
    "# filtered author chart.\n",
    "AUTHOR_COUNT_THRESHOLD = 20\n",
    "\n",
    "# Age rating whose genre distribution is tabulated at the end of the notebook.\n",
    "AGE_RATING = 12"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Columns in the dataset\n",
    "\n",
    "- name of the file (books for adults start with 'adults' and children's books start with 'children')\n",
    "- book title\n",
    "- author\n",
    "- age rating according to Russian age rating system\n",
    "- genres"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# `names=` is given without `header=`, so pandas reads every line of the file,\n",
    "# including the first one, as data.\n",
    "# NOTE(review): assumes the CSV has no header row - TODO confirm.\n",
    "df = pd.read_csv(DATA_PATH, sep=';', names=COLUMN_NAMES)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Non-null values per column.\n",
    "df.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Browse the first PREVIEW_ROWS rows.\n",
    "df.head(PREVIEW_ROWS)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Age ratings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ax = df['age_rating'].value_counts().plot(kind='bar')\n",
    "# The trailing semicolon hides the return value of ax.set(); the figure is still rendered.\n",
    "ax.set(title='Entries per age rating', xlabel='Age rating', ylabel='Number of entries');"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Authors\n",
    "\n",
    "The first chart draws one bar per distinct author and may be very dense; the second keeps only the authors above `AUTHOR_COUNT_THRESHOLD`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Computed once and reused by both author charts below.\n",
    "author_counts = df['author'].value_counts()\n",
    "author_counts.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ax = author_counts.plot(kind='bar', figsize=(25,8))\n",
    "ax.set(title='Entries per author (all authors)', xlabel='Author', ylabel='Number of entries');"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Strict comparison: an author with exactly AUTHOR_COUNT_THRESHOLD entries is excluded.\n",
    "frequent_authors = author_counts[author_counts > AUTHOR_COUNT_THRESHOLD]\n",
    "ax = frequent_authors.plot(kind='bar', figsize=(20,8))\n",
    "ax.set(\n",
    "    title=f'Authors with more than {AUTHOR_COUNT_THRESHOLD} entries',\n",
    "    xlabel='Author',\n",
    "    ylabel='Number of entries',\n",
    ");"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Genres"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ax = df['genres'].value_counts().plot(kind='bar', figsize=(15,8))\n",
    "ax.set(title='Entries per genre', xlabel='Genre', ylabel='Number of entries');"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): the comparison assumes age_rating was parsed as integers - TODO confirm.\n",
    "# If the CSV stores labels such as '12+', AGE_RATING has to be that string instead.\n",
    "is_selected_rating = df['age_rating'] == AGE_RATING\n",
    "genres_count = df.loc[is_selected_rating, 'genres'].value_counts()\n",
    "genres_count"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## References\n",
    "\n",
    "Sherratt, Tim. (2019, November 17). GLAM-Workbench/csv-explorer (Version v0.1.0). Zenodo. http://doi.org/10.5281/zenodo.3544712\n",
    "\n",
    "RusAge (https://www.kaggle.com/oldaandozerskaya/fiction-corpus-for-agebased-text-classification)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}