{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Search narratives\n", "\n", "Most NTSB reports include a narrative written by the investigators. This notebook searches them for references to mast bumping." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "import pandas as pd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Read in the narratives." ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Restore the directory paths previously persisted with IPython's %store magic.\n", "%store -r input_dir\n", "%store -r output_dir" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def read_df(name):\n", "    \"\"\"Read the CSV file called `name` inside `output_dir` and return it as a DataFrame.\"\"\"\n", "    return pd.read_csv(os.path.join(output_dir, name))" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "narratives = read_df(\"narratives.csv\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Join them to fatal U.S. helicopter accidents." ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "helicopter_by_accident = read_df(\"standardized-helicopters-by-accident.csv\")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# Keep only the rows whose `in_usa` value equals True (missing values are excluded).\n", "us_helicopter_by_accident = helicopter_by_accident[helicopter_by_accident[\"in_usa\"].eq(True)]" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# Inner join: rows without a matching (event_id, aircraft_id) pair on both sides are dropped.\n", "merged = pd.merge(\n", "    us_helicopter_by_accident,\n", "    narratives,\n", "    on=[\"event_id\", \"aircraft_id\"],\n", "    how=\"inner\",\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Search them for terms related to mast bumping." 
] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "def search(df, string):\n", "    \"\"\"\n", "    Searches the provided DataFrame's object-dtype columns for the provided string.\n", "\n", "    The match is a case-insensitive, literal substring test: regex\n", "    metacharacters in `string` (such as '.' or '(') are matched as-is,\n", "    and missing values never match.\n", "\n", "    Returns the filtered result as a new DataFrame with duplicate rows dropped.\n", "    \"\"\"\n", "    needle = string.lower()\n", "    text_columns = df.dtypes[df.dtypes == 'object'].index\n", "    if len(text_columns) == 0:\n", "        # pd.concat raises ValueError on an empty list; return zero rows instead.\n", "        return df.iloc[0:0].copy()\n", "    result_rows = [\n", "        df[df[c].str.lower().str.contains(needle, regex=False, na=False)]\n", "        for c in text_columns\n", "    ]\n", "    return pd.concat(result_rows).drop_duplicates()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# NOTE(review): \"mast bump \" ends with a space, so occurrences followed by\n", "# punctuation (e.g. \"mast bump.\") are not matched -- confirm this is intended.\n", "SEARCH_TERMS = [\n", "    \"mast bumping\",\n", "    \"rocking\",\n", "    \"vibration\",\n", "    \"mast bump \",\n", "]\n", "\n", "# A row matching several terms is returned by several searches, so de-duplicate the union.\n", "hits = pd.concat([search(merged, term) for term in SEARCH_TERMS]).drop_duplicates()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "88" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(hits)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Output the result." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Write the hits, ordered by event_id, to searched-narratives.csv in output_dir (index column omitted).\n", "hits.sort_values(\"event_id\", ascending=True).to_csv(\n", "    os.path.join(output_dir, \"searched-narratives.csv\"),\n", "    encoding=\"utf-8\",\n", "    index=False,\n", ")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4" } }, "nbformat": 4, "nbformat_minor": 2 }