{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Prepare a CSV file for upload to Zooniverse Project Builder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import os\n",
    "import glob\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>column</th>\n",
       "      <th>image</th>\n",
       "      <th>page</th>\n",
       "      <th>row</th>\n",
       "      <th>volume</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>2</td>\n",
       "      <td>N193-022_0184-col-2-0.jpg</td>\n",
       "      <td>0184</td>\n",
       "      <td>0</td>\n",
       "      <td>N193-022</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>2</td>\n",
       "      <td>N193-022_0184-col-2-1.jpg</td>\n",
       "      <td>0184</td>\n",
       "      <td>1</td>\n",
       "      <td>N193-022</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39</th>\n",
       "      <td>2</td>\n",
       "      <td>N193-022_0184-col-2-2.jpg</td>\n",
       "      <td>0184</td>\n",
       "      <td>2</td>\n",
       "      <td>N193-022</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40</th>\n",
       "      <td>2</td>\n",
       "      <td>N193-022_0184-col-2-3.jpg</td>\n",
       "      <td>0184</td>\n",
       "      <td>3</td>\n",
       "      <td>N193-022</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>2</td>\n",
       "      <td>N193-022_0184-col-2-4.jpg</td>\n",
       "      <td>0184</td>\n",
       "      <td>4</td>\n",
       "      <td>N193-022</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   column                      image  page  row    volume\n",
       "33      2  N193-022_0184-col-2-0.jpg  0184    0  N193-022\n",
       "32      2  N193-022_0184-col-2-1.jpg  0184    1  N193-022\n",
       "39      2  N193-022_0184-col-2-2.jpg  0184    2  N193-022\n",
       "40      2  N193-022_0184-col-2-3.jpg  0184    3  N193-022\n",
       "26      2  N193-022_0184-col-2-4.jpg  0184    4  N193-022"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Build one record per cropped row image by parsing the fields out of each filename.\n",
    "file_pattern = 'data/columns/rows/N193-022_0184-col-*'\n",
    "# Expected name: <volume>_<page>-col-<column>-<row>, e.g. N193-022_0184-col-2-0.jpg\n",
    "name_re = re.compile(r'(?P<volume>N193-\\d+)_(?P<page>\\d+)-col-(?P<column>\\d+)-(?P<row>\\d+)')\n",
    "images = glob.glob(file_pattern)\n",
    "data = []\n",
    "skipped = []\n",
    "for image in images:\n",
    "    # basename() honours the platform's path separator(s), unlike splitting on '/'.\n",
    "    filename = os.path.basename(image)\n",
    "    # Header crops are deliberately left out; test the filename, not the directory part.\n",
    "    if 'header' in filename:\n",
    "        continue\n",
    "    match = name_re.search(filename)\n",
    "    if match is None:\n",
    "        # The glob also matches files without a row suffix; record them instead of crashing.\n",
    "        skipped.append(filename)\n",
    "        continue\n",
    "    # volume, page, column and row are kept as the strings captured from the name.\n",
    "    data.append({'image': filename, **match.groupdict()})\n",
    "if skipped:\n",
    "    print(f'Skipped {len(skipped)} file(s) with unexpected names: {skipped}')\n",
    "# Column order is pinned to the order shown in this cell's saved output; naming the\n",
    "# columns explicitly also keeps the frame usable when no files match the pattern.\n",
    "df = pd.DataFrame(data, columns=['column', 'image', 'page', 'row', 'volume'])\n",
    "# Sort numerically so that row 10 follows row 9 rather than row 1.\n",
    "df['row'] = pd.to_numeric(df['row'])\n",
    "df = df.sort_values('row')\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the table as CSV, leaving out the DataFrame index.\n",
    "df.to_csv(path_or_buf='sample_rows.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}