{ "cells": [ { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: opencv-python in /scratch/.local/lib/python3.6/site-packages (4.1.1.26)\n", "Requirement already satisfied: numpy>=1.11.3 in /cvmfs/sft.cern.ch/lcg/views/LCG_96python3/x86_64-centos7-gcc8-opt/lib/python3.6/site-packages (from opencv-python) (1.16.4)\n", "Requirement already satisfied: pytesseract in /scratch/.local/lib/python3.6/site-packages (0.3.0)\n", "Requirement already satisfied: Pillow in /cvmfs/sft.cern.ch/lcg/views/LCG_96python3/x86_64-centos7-gcc8-opt/lib/python3.6/site-packages (from pytesseract) (6.0.0)\n", "Collecting tqdm\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/dc/88/d3213e2f3492daf09d8b41631ad6899f56db17ce83ea9c8a579902bafe5e/tqdm-4.35.0-py2.py3-none-any.whl (50kB)\n", "\u001b[K |████████████████████████████████| 51kB 7.1MB/s eta 0:00:011\n", "\u001b[?25hInstalling collected packages: tqdm\n", "Successfully installed tqdm-4.35.0\n" ] } ], "source": [ "import sys\n", "!{sys.executable} -m pip install --user opencv-python\n", "!{sys.executable} -m pip install --user pytesseract\n", "!{sys.executable} -m pip install --user tqdm" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import cv2\n", "import math\n", "import statistics\n", "import os\n", "try:\n", " from PIL import Image\n", "except ImportError:\n", " import Image\n", "import pytesseract\n", "from tqdm.auto import tqdm\n", "from statistics import mean\n", "import re" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "def find_lines(img):\n", " '''\n", " Find straight lines in an image.\n", " Returns a list of lines.\n", " \n", " These settings have been arrived at after much trial and error.\n", " '''\n", " # Convert to grayscale\n", " gray = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)\n", " \n", " # Theshold image (convert to black and white)\n", " retval, th = cv2.threshold(gray,125,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU)\n", " # cv2.imwrite('data/th.jpg',th)\n", " \n", " # Use median blur to get rid of a lot of the text\n", " median = cv2.medianBlur(th, 11)\n", " # cv2.imwrite('data/median.jpg',median)\n", " \n", " # Try to strengthen the remaining lines\n", " kernel = np.ones((5,5),np.uint8)\n", " opened = cv2.morphologyEx(median, cv2.MORPH_OPEN, kernel, iterations=1)\n", " # cv2.imwrite('data/opened.jpg',opened)\n", "\n", " # Find the edges of the remaining shapes\n", " edges = cv2.Canny(opened,50,150,apertureSize=3)\n", " # cv2.imwrite('data/edges.jpg',edges)\n", " \n", " # Find straight lines in the edges\n", " # Note that the minLineLength and maxLineGap values can have a dramatic effect on the number of lines detected.\n", " # Note too that what looks to the human eye as a single straight line,\n", " # can actually be a series of short line with tiny gaps between them,\n", " # so while increasing the minLineLength reduces noise from text, it can also filter out columns.\n", " lines = cv2.HoughLinesP(image=edges,rho=1,theta=np.pi/180, threshold=200,lines=np.array([]), minLineLength=100,maxLineGap=100)\n", " return lines\n", "\n", "\n", "def find_header(img):\n", " '''\n", " Locate the page header, and return the y position of the bottom.\n", " Note that in this case using OCR seemed easier than trying to sort/cluster lines.\n", " '''\n", " \n", " # Image dimensions\n", " (h, w) = img.shape[:2]\n", " points = 
[]\n", " \n", " # The header will always be at the top, so crop off the top of the image, rather than OCRing the whole thing\n", " cropped = img[0:round(h/4), 0:round(w-(w/4))]\n", " \n", " # I should probably do some image preprocessing here (see row detection notebook)\n", " # Get OCR results\n", " results = pytesseract.image_to_data(cropped, output_type=pytesseract.Output.DICT)\n", " \n", " # Loop through the OCR results looking for specific words that appear in the header\n", " for index, word in enumerate(results['text']):\n", " # These are the words that appear in the header\n", " # If we find one, add its 'top' value to our list of points\n", " if re.search(r'Shares|Quotations|Buyers|Sellers|Business|Done|Closing prices|Sales', word, flags=re.IGNORECASE) and results['height'][index] < 50:\n", " # y = results['top'][index]\n", " points.append(results['top'][index])\n", " # y = round(mean(points))\n", " # print(points)\n", " \n", " try:\n", " # Sort points and get the first\n", " y = sorted(points)[0]\n", " except IndexError:\n", " # If we can't find a header return 0\n", " y = 0\n", " \n", " # Return y value of header\n", " return y\n", " \n", "\n", "def check_for_skew(lines):\n", " '''\n", " Check for skewing by looking at the near vertical lines detected in the image.\n", " '''\n", " angles = []\n", " \n", " # Loop through detected lines\n", " for line in lines:\n", " # Get coords of line\n", " for x1,y1,x2,y2 in line:\n", " # Ignore short lines and lines in header\n", " if abs(y1 - y2) > 150 and x1 > 300:\n", " # Get the angle of the line\n", " if y2 > y1:\n", " radians = math.atan2((y2 - y1), (x2 - x1))\n", " else:\n", " radians = math.atan2((y1 - y2), (x1 - x2))\n", " degrees = math.degrees(radians)\n", " # print(degrees)\n", " # If it's vertical-ish, save this angle\n", " if degrees >= 80 and degrees <= 100:\n", " angles.append(degrees)\n", " # Get the media of the saved angles\n", " angle = statistics.median(angles) - 90\n", " # print(angle)\n", " return angle\n", "\n", "\n", "def deskew(img, angle):\n", " '''\n", " Deskew image by rotating it by the supplied angle.\n", " '''\n", " # Get image dimensions\n", " (h, w) = img.shape[:2]\n", " \n", " # Get the centre of the image\n", " center = (w // 2, h // 2)\n", " \n", " # Rotate image by angle\n", " M = cv2.getRotationMatrix2D(center, angle, 1.0)\n", " rotated = cv2.warpAffine(img, M, (w, h),flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)\n", " \n", " # Return the rotated image\n", " return rotated\n", "\n", "\n", "def add_grid(img):\n", " '''\n", " Draws a 100 x 100px grid on image.\n", " Can be useful in interpreting column detection results.\n", " '''\n", " h, w = img.shape[:2]\n", " for x in range(0, w, 100):\n", " cv2.line(img,(x,0),(x,h),(255,0,0),1) \n", " for y in range(0, h, 100):\n", " cv2.line(img,(0,y),(w,y),(255,0,0),1)\n", " return img\n", "\n", "\n", "def find_top(lines):\n", " '''\n", " Use horizontal lines near the top of the page to provide an approximation of the header height.\n", " Used to crop page to ignore lines in header area.\n", " More accurate header location is found using Tesseract.\n", " '''\n", " top = 0\n", " y_values = []\n", " for line in lines:\n", " for x1,y1,x2,y2 in line:\n", " bottom = y1 if y1 > y2 else y2\n", " if bottom < 1000:\n", " radians = math.atan2((y1 - y2), (x1 - x2))\n", " degrees = math.degrees(radians)\n", " if degrees > 179 and degrees < 181:\n", " y_values.append(bottom)\n", " if y_values:\n", " top = sorted(y_values)[-1]\n", " return top \n", "\n", "\n", "def 
find_columns(lines, h, w):\n", " '''\n", " Identifies most likely column values from within the set of straight lines in an image.\n", " This could do with some cleaning up, but it's working well at the moment, so I don't really want to fiddle any more.\n", " Note that this does depend on some knowledge of the images to define ranges of expected values.\n", " '''\n", " x_values = []\n", " \n", " # Get the approximate position of the header so we can ignore lines above this\n", " column_top = find_top(lines)\n", "\n", " # Find the x values of vertical lines\n", " for line in lines:\n", " for x1,y1,x2,y2 in line:\n", " \n", " # Find the top\n", " top = y1 if y1 < y2 else y2\n", " \n", " # Ignore column lines at the top & bottom of the image\n", " if top > column_top and top < (h - 600):\n", "\n", " # Find the leftmost point\n", " first = x1 if x1 < x2 else x2\n", " \n", " # Find the angle of the line\n", " radians = math.atan2((y1 - y2), (x1 - x2))\n", " degrees = abs(math.degrees(radians))\n", " \n", " # If the line is (close to) vertical, we'll save the left-most x value\n", " if degrees >= 89 and degrees <= 91:\n", " x_values.append(first)\n", " \n", " # Sort the x_values\n", " x_values = sorted(x_values)\n", " \n", " # Cluster together values within the specified distance\n", " clusters = []\n", " start = 0\n", " # Lines less than this distance apart will be clustered\n", " distance = 10\n", " cluster = []\n", " \n", " # Loop through x values\n", " for x in x_values:\n", " \n", " # If the x value is less than the specified distance from the previous point,\n", " # we'll add it to the current cluster\n", " if x < start + distance:\n", " cluster.append(x)\n", " \n", " # If not we'll save the current cluster, and start a new one\n", " else:\n", " if cluster:\n", " # Add the current cluster to the list of clusters\n", " clusters.append(cluster)\n", " \n", " # Start a new cluster at the current point\n", " cluster = [x]\n", " \n", " # Set the current position\n", " start = x\n", " \n", " # Add the last cluster once we've finished the loop\n", " clusters.append(cluster)\n", "\n", " # Now we have a list of clustered x values\n", " # We'll compare nearby clusters and keep the ones with the most values (most likely to be columns)\n", " best_clusters = [[0]]\n", " \n", " # Loop through clusters\n", " for cluster in clusters:\n", " \n", " # If the current cluster is within 200px of the previous one\n", " if cluster[0] < best_clusters[-1][-1] + 200:\n", " \n", " # Check to see which cluster contains the most values\n", " # If it's the current one we'll add it to our best clusters\n", " if len(cluster) > len(best_clusters[-1]):\n", " \n", " # Remove the previous cluster from best clusters\n", " best_clusters.pop()\n", " \n", " # Add this one\n", " best_clusters.append(cluster)\n", " \n", " # If this cluster isn't near the previous one, add it to best clusters\n", " else:\n", " best_clusters.append(cluster)\n", " \n", " # Now we have our best candidates for columns in best clusters\n", " # We'll do some further filtering by checking the clusters against our expectations of column positions\n", " # The pixel values used below are based on trial and error with the Stock Exchange images\n", " # Obviously if you were using this on other images you'd want to adjust them accordingly\n", " \n", " columns = []\n", " start = 0\n", " gutter = 0\n", " gap = None\n", " \n", " # Loop through our best clusters\n", " for cluster in best_clusters:\n", " \n", " # If the leftmost point in this cluster is less than 600 
then it's the gutter\n", "        if cluster and cluster[0] < 600:\n", "            \n", "            # Set the gutter value to a mean of the clustered points\n", "            gutter = mean(cluster)\n", "            \n", "            # Sometimes the gutter isn't detected, so we'll set a reasonable start position\n", "            if gutter < 100:\n", "                start = 100\n", "            else:\n", "                start = gutter\n", "        else:\n", "            # Checking the gap between this cluster and the previous one\n", "            if gap:\n", "                this_gap = gap\n", "            else:\n", "                \n", "                # Current gap is the leftmost point of this cluster minus the previous column position\n", "                this_gap = cluster[0] - start\n", "            \n", "            # This range represents approximate min/max column widths\n", "            # We'll look for columns at 100 px intervals starting from the max value until we hit the min value\n", "            for width in reversed(range(900, 2000, 100)):\n", "                \n", "                # Try to make sure columns are roughly the same width\n", "                if (cluster[0] - start) > (this_gap - 600) and (cluster[0] - start) < (this_gap + 600):\n", "                    \n", "                    # If cluster falls within expected values, we'll assume it's a column\n", "                    if cluster and cluster[0] > (start + width) and cluster[0] < (w - 600) and this_gap < 2600:\n", "                        \n", "                        # Save mean of clustered values as column\n", "                        columns.append(mean(cluster))\n", "                        \n", "                        # Set the next start value to value of the last point in cluster\n", "                        start = cluster[-1]\n", "                        gap = this_gap\n", "                        \n", "                        # Don't look for any more columns in this cluster\n", "                        break\n", "    #print(gutter)\n", "    #print(columns)\n", "    return (gutter, columns)\n", "\n", "\n", "def resize(img, h, w):\n", "    '''\n", "    Resize image to a max width of 5000 px.\n", "    '''\n", "    # Find the scale to use, based on max width\n", "    scale = 5000 / float(w)\n", "    \n", "    # Resize the image\n", "    resized = cv2.resize(img, None, fx=scale, fy=scale, interpolation = cv2.INTER_AREA)\n", "    return resized\n", "\n", "\n", "def save_header(img, header, w, image_name, output_dir):\n", "    '''\n", "    Save the detected header as a separate image.\n", "    '''\n", "    \n", "    # Crop the image using header value\n", "    # Numpy slicing - roi = im[y1:y2, x1:x2]\n", "    header_img = img[0:header+20, 0:w]\n", "    \n", "    # Find the scale to use, based on max width\n", "    scale = 2000 / float(w)\n", "    \n", "    # Resize the image\n", "    resized = cv2.resize(header_img, None, fx=scale, fy=scale, interpolation = cv2.INTER_AREA)\n", "    \n", "    # Save the cropped image\n", "    cv2.imwrite('{}/{}-header.jpg'.format(output_dir, image_name[:-4]), resized)\n", "\n", "    \n", "def save_columns(img, columns, header, h, image_name, output_dir):\n", "    '''\n", "    Save each detected column as a separate image.\n", "    Note that the columns list should include the gutter at the beginning and the image width at the end.\n", "    '''\n", "    \n", "    # Where to save the images\n", "    col_dir = os.path.join(output_dir, 'columns')\n", "    \n", "    # Loop through the column values\n", "    for index, column in enumerate(columns):\n", "        \n", "        # Get the value of the next column to use as the width of the cropped column\n", "        try:\n", "            next_col = columns[index+1]\n", "        except IndexError:\n", "            \n", "            # If there's no next column we've reached the end of the image, so do nothing\n", "            pass\n", "        else:\n", "            \n", "            # Add a little to the margins of the image\n", "            if column > 20:\n", "                this_col = column - 20\n", "            else:\n", "                this_col = column\n", "            \n", "            # Crop the image to the dimensions of the column\n", "            col_img = img[header-20:h, this_col:next_col]\n", "            \n", "            # Save the cropped image, using the index value to denote column order\n", "            cv2.imwrite('{}/{}-col-{}.jpg'.format(col_dir, image_name[:-4], index+1), col_img)\n", "    \n", "    \n", "def display_lines(image_name, output_dir, img, lines):\n", "    '''\n", "    For testing / debugging - shows ALL the detected lines\n", "    '''\n", "    for line in lines:\n", "        #print(line)\n", "        x1,y1,x2,y2 = line[0]\n", "        cv2.line(img,(x1,y1),(x2,y2),(0,0,255),8)\n", "    #cv2.imwrite('{}/{}-lines.jpg'.format(output_dir, image_name[:-4]), img)\n", "    \n", "\n", "def process_image(image_name, image_path, output_dir='test', markup=False, grid=False):\n", "    '''\n", "    Detect columns and header in the supplied image.\n", "    \n", "    Parameters:\n", "        image_name\n", "        image_path\n", "        output_dir (must exist)\n", "        markup – if True, draw the results on the image; if False, crop and save the detected regions.\n", "        grid – if True, draw a grid on the image\n", "    '''\n", "    \n", "    img = cv2.imread(image_path)\n", "    \n", "    # Get image dimensions\n", "    try:\n", "        h, w = img.shape[:2]\n", "    \n", "    # Weed out dodgy images\n", "    except AttributeError:\n", "        print('Not a valid image: {}'.format(image_path))\n", "    \n", "    # If it looks ok, then proceed...\n", "    else:\n", "        \n", "        # To standardise things a little, we'll resize images with a width greater than 5000\n", "        if w > 5000:\n", "            img = resize(img, h, w)\n", "            \n", "            # Get the new dimensions\n", "            h, w = img.shape[:2]\n", "        \n", "        # Detect straight lines in the image\n", "        lines = find_lines(img)\n", "        \n", "        # Use the detected lines to check for skew\n", "        # I'm not actually sure if these deskewing steps are useful\n", "        angle = check_for_skew(lines)\n", "        \n", "        # If image seems to be skewed, then deskew!\n", "        if angle != 0:\n", "            # print('Deskewing')\n", "            img = deskew(img, angle)\n", "            \n", "            # Once deskewed we have to redo line detection because positions will have changed\n", "            lines = find_lines(img)\n", "        \n", "        #display_lines(image_name, output_dir, img, lines)\n", "        \n", "        # Filter the detected lines to identify columns\n", "        # gutter, columns = find_columns(lines, h, w)\n", "        \n", "        # Find the bottom of the header\n", "        header = find_header(img)\n", "        \n", "        # Crop and save the detected header\n", "        save_header(img, header, w, image_name, output_dir)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "079c640cfcd14dfa8188f9b4554b7033", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Directories', max=199, style=ProgressStyle(description_width=…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "a4d7ead9f3054240b20de955a2b08e1d", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=1, bar_style='info', description='Images', max=1, style=ProgressStyle(descrip…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "e85fb90228034a03a67a0e5cbe2022e5", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=303, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "a2ca46905b4f42c8b8ae7ff56e4f3ef5", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=312, 
style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "2584582ea7e8442bb2bdecb602fd1718", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=345, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "946c7b910f5947128c0eb0ebffa2260f", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=312, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "ab4e2fc6867846b391984b9c96f02e8a", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=305, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "b96ca3718b2b4832970af287785c1dc9", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=334, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "9efd5d19472e42169ebc160c224058f4", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=350, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Not a valid image: /scratch/cloudstor/Shared/ANU-Library/Sydney Stock Exchange 1901-1950/AU NBAC N193-007/.sys.v#.N193-007_0012.tif\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "e2863f3bf59f499586c59660eb2dff53", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=318, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "d466e85f8b0e4ff5bd720c758e7dedbd", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=327, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "3e08c973a06247039f8536a1041290df", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=327, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "7b01c3e4a8614d9080a6d4a658687b06", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=350, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "187e92e9d1ed4c38a80a363c3006f54f", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=310, 
style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "e14c199e42f146119a3014bf7dabdc47", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=330, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "eff2ccf7c4a74adaa9d1ea246d99eea1", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=349, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "7858ba74e48e41ac8120a03054e32fb0", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=313, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "fa8d3088981e476ca24904c7c5255fe6", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=331, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "af49a9cbf99c422f80aa3be151befde3", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=322, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "755f7978517d44bf9001d731d934cd15", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=348, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "4714b8d069554553be3678826d1a02fc", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=312, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "8598e1e072e64dc18e63ff27766eac87", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=330, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "c9aee6603511441fa8b550877418bbf0", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=314, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "89452a33a92a402184350e5bef0a8fdd", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=344, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "7222b5aec86546c5aefe292b48c0fc5c", "version_major": 2, 
"version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=310, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "6f5b7cf7b83f4cc1a578f3bcbc8a6039", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=323, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "8879f9b7213c43bd826d7a6c5ffe7dbb", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=332, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "77a0ef16ffa24495a65000cff35af918", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=349, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "c10317a7ac154c53a22fb79c17a39e75", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=314, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "335f8f7cbe4a479e99e1a4af8dc544f3", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=328, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "8037e129ef144944851ae36c8e15b39b", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=327, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "357f19d19b9e4db991bf0eba99359d82", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=339, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "9145ba2b18844b328ae62c6847c54fa3", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=316, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "6d235b1054d14525baf1d4fdd1f328d9", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=329, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Directory of directories\n", "dir_path = '/scratch/cloudstor/Shared/ANU-Library/Sydney Stock Exchange 1901-1950'\n", "\n", "# This is where the processed images should go\n", "# output_dir = 'processed/samples'\n", "# os.makedirs(os.path.join(output_dir, 'columns'), exist_ok=True)\n", "os.makedirs(os.path.join('all_headers'), 
exist_ok=True)\n", "\n", "# Loop through directories\n", "for img_dir in tqdm([d for d in os.listdir(dir_path) if os.path.isdir(os.path.join(dir_path, d)) and d not in ['AU NBAC N193-014']], desc='Directories'):\n", "    output_dir = os.path.join('all_headers', img_dir)\n", "    os.makedirs(output_dir, exist_ok=True)\n", "    # Loop through images with .tif(f) extension\n", "    for img_name in tqdm([i for i in os.listdir(os.path.join(dir_path, img_dir)) if '.tif' in i[-5:].lower()], leave=False, desc='Images'):\n", "        # print(img_name)\n", "        img_path = os.path.join(dir_path, img_dir, img_name)\n", "        output_img = os.path.join(output_dir, '{}-header.jpg'.format(img_name[:-4]))\n", "        if not os.path.exists(output_img):\n", "            try:\n", "                #find_lines(img, img_path)\n", "                process_image(img_name, img_path, output_dir, markup=False)\n", "            except (TypeError, statistics.StatisticsError):\n", "                raise" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# Directory of directories\n", "dir_path = '/scratch/cloudstor/Shared/ANU-Library/Sydney Stock Exchange 1901-1950'\n", "\n", "def process_dir(img_dir):\n", "    output_dir = os.path.join('all_headers', img_dir)\n", "    os.makedirs(output_dir, exist_ok=True)\n", "    # Loop through images with .tif(f) extension\n", "    for img_name in tqdm([i for i in os.listdir(os.path.join(dir_path, img_dir)) if '.tif' in i[-5:].lower()], leave=False, desc='Images'):\n", "        # print(img_name)\n", "        img_path = os.path.join(dir_path, img_dir, img_name)\n", "        output_img = os.path.join(output_dir, '{}-header.jpg'.format(img_name[:-4]))\n", "        if not os.path.exists(output_img):\n", "            try:\n", "                #find_lines(img, img_path)\n", "                process_image(img_name, img_path, output_dir, markup=False)\n", "            except (TypeError, statistics.StatisticsError):\n", "                raise" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "79ce437cbaec4a90b3565f8c43a6beba", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=390, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\r" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "ea7c3639f86b4920a1711b7a9d8ba722", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=605, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "for img_dir in ['AU NBAC N193-135', 'AU NBAC N193-146', 'AU NBAC N193-165', 'AU NBAC N193-191']:\n", "    process_dir(img_dir)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 4 }