{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Find columns and headers\n", "\n", "Process Stock Exchange images, detecting the positions of columns and headers." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import cv2\n", "import math\n", "import statistics\n", "import os\n", "try:\n", " from PIL import Image\n", "except ImportError:\n", " import Image\n", "import pytesseract\n", "from tqdm.auto import tqdm\n", "from statistics import mean\n", "import re\n", "from fuzzywuzzy import fuzz\n", "import tempfile\n", "from pathlib import Path" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# These OCR image preprocessing steps are based on https://stackoverflow.com/a/43493383\n", "# I don't really understand why this particular combination of filters works, but it does seem to improve OCR results\n", "\n", "BINARY_THRESHOLD = 200\n", "\n", "def process_image_for_ocr(file_path):\n", " # TODO : Implement using opencv\n", " temp_filename = set_image_dpi(file_path)\n", " im_new = remove_noise_and_smooth(temp_filename)\n", " return im_new\n", "\n", "\n", "def set_image_dpi(file_path):\n", " im = Image.open(file_path)\n", " length_x, width_y = im.size\n", " factor = max(1, int(5000 / length_x))\n", " size = factor * length_x, factor * width_y\n", " # size = (1800, 1800)\n", " im_resized = im.resize(size, Image.ANTIALIAS)\n", " temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.jpg')\n", " temp_filename = temp_file.name\n", " im.save(temp_filename, dpi=(300, 300))\n", " return temp_filename\n", "\n", "\n", "def image_smoothening(img):\n", " ret1, th1 = cv2.threshold(img, BINARY_THRESHOLD, 255, cv2.THRESH_BINARY)\n", " ret2, th2 = cv2.threshold(th1, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)\n", " blur = cv2.GaussianBlur(th2, (1, 1), 0)\n", " ret3, th3 = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)\n", " return th3\n", "\n", "\n", "def remove_noise_and_smooth(file_name):\n", " img = cv2.imread(file_name, 0)\n", " # gray = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)\n", " filtered = cv2.adaptiveThreshold(img.astype(np.uint8), 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 41, 3)\n", " kernel = np.ones((1, 1), np.uint8)\n", " opening = cv2.morphologyEx(filtered, cv2.MORPH_OPEN, kernel)\n", " closing = cv2.morphologyEx(opening, cv2.MORPH_CLOSE, kernel)\n", " img = image_smoothening(img)\n", " or_image = cv2.bitwise_or(img, closing)\n", " (h, w) = or_image.shape[:2]\n", " img = resize(or_image, h, w)\n", " return img" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def find_lines(img):\n", " '''\n", " Find straight lines in an image.\n", " Returns a list of lines.\n", " \n", " These settings have been arrived at after much trial and error.\n", " '''\n", " # Convert to grayscale\n", " gray = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)\n", " \n", " # Theshold image (convert to black and white)\n", " retval, th = cv2.threshold(gray,125,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU)\n", " # cv2.imwrite('data/th.jpg',th)\n", " \n", " # Use median blur to get rid of a lot of the text\n", " median = cv2.medianBlur(th, 11)\n", " # cv2.imwrite('data/median.jpg',median)\n", " \n", " # Try to strengthen the remaining lines\n", " kernel = np.ones((5,5),np.uint8)\n", " opened = cv2.morphologyEx(median, cv2.MORPH_OPEN, kernel, iterations=1)\n", " # cv2.imwrite('data/opened.jpg',opened)\n", "\n", " # Find the 
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def find_lines(img):\n", "    '''\n", "    Find straight lines in an image.\n", "    Returns a list of lines.\n", "\n", "    These settings have been arrived at after much trial and error.\n", "    '''\n", "    # Convert to grayscale\n", "    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)\n", "\n", "    # Threshold image (convert to black and white)\n", "    retval, th = cv2.threshold(gray, 125, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)\n", "    # cv2.imwrite('data/th.jpg', th)\n", "\n", "    # Use median blur to get rid of a lot of the text\n", "    median = cv2.medianBlur(th, 11)\n", "    # cv2.imwrite('data/median.jpg', median)\n", "\n", "    # Try to strengthen the remaining lines\n", "    kernel = np.ones((5, 5), np.uint8)\n", "    opened = cv2.morphologyEx(median, cv2.MORPH_OPEN, kernel, iterations=1)\n", "    # cv2.imwrite('data/opened.jpg', opened)\n", "\n", "    # Find the edges of the remaining shapes\n", "    v = np.median(gray)\n", "    lower = int(max(0, (1.0 - 0.33) * v))\n", "    upper = int(min(255, (1.0 + 0.33) * v))\n", "    # edges = cv2.Canny(opened, 50, 150, apertureSize=3)\n", "    edges = cv2.Canny(opened, lower, upper, apertureSize=3)\n", "    # cv2.imwrite('data/edges.jpg', edges)\n", "\n", "    # Find straight lines in the edges\n", "    # Note that the minLineLength and maxLineGap values can have a dramatic effect on the number of lines detected.\n", "    # Note too that what looks to the human eye like a single straight line\n", "    # can actually be a series of short lines with tiny gaps between them,\n", "    # so while increasing the minLineLength reduces noise from text, it can also filter out columns.\n", "    lines = cv2.HoughLinesP(image=edges, rho=1, theta=np.pi/180, threshold=200, lines=np.array([]), minLineLength=100, maxLineGap=100)\n", "    # print(lines)\n", "    return lines\n", "\n",
"def find_margin(df):\n", "    return int(round(df.loc[(df['level'] == 4) & (df['left'] < 150)]['left'].mean()))\n", "\n",
"def find_col_width(df):\n", "    candidates = []\n", "    for confidence in reversed(range(80, 110, 10)):\n", "        for heading in ['buyers', 'closing', 'quotations']:\n", "            for word in df.loc[(df['level'] == 5) & (df['left'] < 1500)].sort_values(by='top').itertuples():\n", "                # print(word.text.lower())\n", "                if len(str(word.text)) > 5 and fuzz.partial_ratio(heading, word.text.lower()) >= confidence:\n", "                    # print(word)\n", "                    # print(fuzz.ratio('buyers', word.text.lower()))\n", "                    if word.left > 625:\n", "                        candidates.append(word.left)\n", "    return candidates\n", "\n",
"def find_header_height(df):\n", "    candidates = []\n", "    for confidence in reversed(range(80, 110, 10)):\n", "        for heading in ['shares', 'closing', 'sales', 'quotations', 'buyers', 'sellers', 'business']:\n", "            for word in df.loc[(df['level'] == 5) & (df['top'] < 1750) & (df['left'] < 3750) & (df['height'] < 90)].sort_values(by=['top', 'left']).itertuples():\n", "                if len(str(word.text)) > 5 and fuzz.partial_ratio(heading, word.text.lower()) >= confidence:\n", "                    # print(word)\n", "                    # print(fuzz.partial_ratio(heading, word.text.lower()))\n", "                    # return word.top\n", "                    candidates.append(word.top)\n", "    return candidates\n", "\n",
"def find_header(img_path):\n", "    # Image dimensions\n", "    img = process_image_for_ocr(img_path)\n", "    (h, w) = img.shape[:2]\n", "    points = []\n", "    # The header will always be at the top, so crop off the top of the image, rather than OCRing the whole thing\n", "    cropped = img[0:1750, 0:w]\n", "    col_widths = []\n", "    header_heights = []\n", "    # The psm settings can greatly affect the results, but they're unpredictable.\n", "    # Sometimes one setting works better than the other, I don't know why,\n", "    # so we're going to try them both and look for the best result.\n", "    for psm in [4, 6]:\n", "        df = pytesseract.image_to_data(cropped, config=f'--psm {psm} --oem 1 -l eng', output_type=pytesseract.Output.DATAFRAME)\n", "        col_widths += find_col_width(df)\n", "        # print(df.loc[(df['level'] == 5) & (df['left'] < 3750)].sort_values(by=['top', 'left']).to_dict('records'))\n", "        header_heights += find_header_height(df)\n", "    # margin = find_margin(df)\n",
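"    # Take the median of the candidate positions so that a single stray fuzzy match\n", "    # elsewhere on the page can't skew the result\n",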
"    try:\n", "        # header_height = sorted(header_heights)[0]\n", "        header_height = int(statistics.median(header_heights))\n", "    except (IndexError, statistics.StatisticsError):\n", "        header_height = 0\n", "    try:\n", "        # col_width = sorted(col_widths)[0] - 10\n", "        col_width = int(statistics.median(col_widths))\n", "    except (IndexError, statistics.StatisticsError):\n", "        col_width = 0\n", "    # print(col_width, header_height)\n", "    return (col_width, header_height)\n", "\n", "\n",
"def check_for_skew(lines):\n", "    '''\n", "    Check for skewing by looking at the near vertical lines detected in the image.\n", "    '''\n", "    angles = []\n", "\n", "    # Loop through detected lines\n", "    for line in lines:\n", "        # Get coords of line\n", "        for x1, y1, x2, y2 in line:\n", "            # Ignore short lines and lines in header\n", "            if abs(y1 - y2) > 150 and x1 > 300:\n", "                # Get the angle of the line\n", "                if y2 > y1:\n", "                    radians = math.atan2((y2 - y1), (x2 - x1))\n", "                else:\n", "                    radians = math.atan2((y1 - y2), (x1 - x2))\n", "                degrees = math.degrees(radians)\n", "                # print(degrees)\n", "                # If it's vertical-ish, save this angle\n", "                if degrees >= 80 and degrees <= 100:\n", "                    angles.append(degrees)\n", "    # Get the median of the saved angles\n", "    angle = statistics.median(angles) - 90\n", "    # print(angle)\n", "    return angle\n", "\n", "\n",
"def deskew(img, angle):\n", "    '''\n", "    Deskew image by rotating it by the supplied angle.\n", "    '''\n", "    # Get image dimensions\n", "    (h, w) = img.shape[:2]\n", "\n", "    # Get the centre of the image\n", "    center = (w // 2, h // 2)\n", "\n", "    # Rotate image by angle\n", "    M = cv2.getRotationMatrix2D(center, angle, 1.0)\n", "    rotated = cv2.warpAffine(img, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)\n", "\n", "    # Return the rotated image\n", "    return rotated\n", "\n", "\n",
"def add_grid(img):\n", "    '''\n", "    Draws a 100 x 100px grid on image.\n", "    Can be useful in interpreting column detection results.\n", "    '''\n", "    h, w = img.shape[:2]\n", "    for x in range(0, w, 100):\n", "        cv2.line(img, (x, 0), (x, h), (255, 0, 0), 1)\n", "    for y in range(0, h, 100):\n", "        cv2.line(img, (0, y), (w, y), (255, 0, 0), 1)\n", "    return img\n", "\n", "\n",
"def find_top(lines):\n", "    '''\n", "    Use horizontal lines near the top of the page to provide an approximation of the header height.\n", "    Used to crop page to ignore lines in header area.\n", "    More accurate header location is found using Tesseract.\n", "    '''\n", "    top = 0\n", "    y_values = []\n", "    for line in lines:\n", "        for x1, y1, x2, y2 in line:\n", "            bottom = y1 if y1 > y2 else y2\n", "            if bottom < 1000:\n", "                radians = math.atan2((y1 - y2), (x1 - x2))\n", "                degrees = math.degrees(radians)\n", "                if degrees > 179 and degrees < 181:\n", "                    y_values.append(bottom)\n", "    if y_values:\n", "        top = sorted(y_values)[-1]\n", "    return top\n", "\n", "\n",
"def find_columns(lines, h, w, column_top, col_width):\n", "    '''\n", "    Identifies most likely column values from within the set of straight lines in an image.\n", "    This could do with some cleaning up, but it's working well at the moment, so I don't really want to fiddle any more.\n", "    Note that this does depend on some knowledge of the images to define ranges of expected values.\n", "    '''\n", "    x_values = []\n", "\n", "    # Get the approximate position of the header so we can ignore lines above this\n", "    # (note that this recalculated value overrides the supplied column_top argument)\n", "    column_top = find_top(lines)\n", "\n", "    # Find the x values of vertical lines\n", "    for line in lines:\n", "        for x1, y1, x2, y2 in line:\n", "\n", "            # Find the top\n", "            top = y1 if y1 < y2 else y2\n", "\n", "            # Ignore column lines at the top & bottom of the image\n", "            if top > column_top and top < (h - 600):\n", "\n", "                # Find the leftmost point\n", "                first = x1 if x1 < x2 else x2\n", "\n", "                # Find the angle of the line\n", "                radians = math.atan2((y1 - y2), (x1 - x2))\n", "                degrees = abs(math.degrees(radians))\n", "\n", "                # If the line is (close to) vertical, we'll save the left-most x value\n", "                if degrees >= 89 and degrees <= 91:\n", "                    x_values.append(first)\n", "\n",
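"    # x_values now holds one x coordinate per detected vertical segment; a real column\n", "    # rule usually contributes many nearby segments while stray text contributes only a\n", "    # few, which is what the clustering below exploits\n",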
"    # Sort the x_values\n", "    x_values = sorted(x_values)\n", "\n", "    # Cluster together values within the specified distance\n", "    clusters = []\n", "    start = 0\n", "    # Lines less than this distance apart will be clustered\n", "    distance = 10\n", "    cluster = []\n", "\n", "    # Loop through x values\n", "    for x in x_values:\n", "\n", "        # If the x value is less than the specified distance from the previous point,\n", "        # we'll add it to the current cluster\n", "        if x < start + distance:\n", "            cluster.append(x)\n", "\n", "        # If not we'll save the current cluster, and start a new one\n", "        else:\n", "            if cluster:\n", "                # Add the current cluster to the list of clusters\n", "                clusters.append(cluster)\n", "\n", "            # Start a new cluster at the current point\n", "            cluster = [x]\n", "\n", "        # Set the current position\n", "        start = x\n", "\n", "    # Add the last cluster once we've finished the loop\n", "    clusters.append(cluster)\n", "\n",
"    # Now we have a list of clustered x values\n", "    # We'll compare nearby clusters and keep the ones with the most values (most likely to be columns)\n", "    best_clusters = [[0]]\n", "\n", "    # Loop through clusters\n", "    for cluster in clusters:\n", "\n", "        # If the current cluster is within 200px of the previous one\n", "        if cluster[0] < best_clusters[-1][-1] + 200:\n", "\n", "            # Check to see which cluster contains the most values\n", "            # If it's the current one we'll add it to our best clusters\n", "            if len(cluster) > len(best_clusters[-1]):\n", "\n", "                # Remove the previous cluster from best clusters\n", "                best_clusters.pop()\n", "\n", "                # Add this one\n", "                best_clusters.append(cluster)\n", "\n", "        # If this cluster isn't near the previous one, add it to best clusters\n", "        else:\n", "            best_clusters.append(cluster)\n", "\n", "    # print(best_clusters)\n", "\n",
"    # Now we have our best candidates for columns in best clusters\n", "    # We'll do some further filtering by checking the clusters against our expectations of column positions\n", "    # The pixel values used below are based on trial and error with the Stock Exchange images\n", "    # Obviously if you were using this on other images you'd want to adjust them accordingly\n", "\n", "    columns = []\n", "    start = 0\n", "    gutter = 0\n", "    gap = None\n", "    max_col_width = 2000\n", "\n", "    # Loop through our best clusters\n", "    for cluster in best_clusters:\n", "        min_col_width = 950\n", "        # If the leftmost point in this cluster is less than 600 then it's the gutter\n", "        if cluster and cluster[0] < 600:\n", "\n", "            # Set the gutter value to a mean of the clustered points\n", "            gutter = mean(cluster)\n", "\n", "            # Sometimes the gutter isn't detected, so we'll set a reasonable start position\n", "            if gutter == 0:\n", "                gutter = 200\n", "            if gutter <= 200:\n", "                start = 250\n", "            else:\n", "                start = gutter + 50\n", "            if col_width:\n", "                # print(col_width)\n", "                # min_col_width = ((col_width - start) * 2) - 180\n", "                # max_col_width = ((col_width - start) * 2) + 180\n", "                min_col_width = max(min_col_width, int(round((col_width - start) * 1.65)))\n", "                max_col_width = int(round((col_width - start) * 2.35))\n", "                # print(min_col_width)\n", "                # print(list(reversed(range(min_col_width, max_col_width + 100, 100))))\n", "        else:\n", "            # Checking the gap between this cluster and the previous one\n", "            if gap:\n", "                this_gap = gap\n", "            else:\n", "\n", "                # Current gap is the leftmost point of this cluster minus the previous column position\n", "                this_gap = cluster[0] - start\n", "\n",
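"            # Scanning candidate widths from widest to narrowest means a cluster is\n", "            # accepted at the most generous width first, which reduces the chance of\n", "            # mistaking a line in the middle of a column for a column boundary\n",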
"            # This range represents approximate min/max column widths\n", "            # We'll look for columns at 100 px intervals starting from the max value until we hit the min value\n", "            for width in reversed(range(min_col_width, max_col_width + 100, 100)):\n", "                cluster_mean = mean(cluster)\n", "                # print(width)\n", "                # print(cluster_mean)\n", "                # print(start)\n", "                # print(this_gap)\n", "                # print('----')\n", "                # Try to make sure columns are roughly the same width\n", "                if (cluster_mean - start) > (this_gap - 500) and (cluster_mean - start) < (this_gap + 500):\n", "\n", "                    # If cluster falls within expected values, we'll assume it's a column\n", "                    if cluster and cluster_mean >= (start + width) and cluster_mean <= (w - 900) and this_gap < 2600:\n", "\n", "                        # Save mean of clustered values as column\n", "                        columns.append(cluster_mean)\n", "\n", "                        # Set the next start value to value of the last point in cluster\n", "                        start = cluster_mean\n", "                        gap = this_gap\n", "\n", "                        # Don't look for any more columns in this cluster\n", "                        break\n", "    columns.append(w)\n", "    return (gutter, columns)\n", "\n", "\n",
"def resize(img, h, w):\n", "    '''\n", "    Resize image to a width of 5000 px.\n", "    '''\n", "    # Find the scale to use, based on the target width\n", "    scale = 5000 / float(w)\n", "\n", "    # Resize the image\n", "    resized = cv2.resize(img, None, fx=scale, fy=scale, interpolation=cv2.INTER_AREA)\n", "    return resized\n", "\n", "\n",
"def save_header(img, header, w, image_name, output_dir):\n", "    '''\n", "    Save the detected header as a separate image.\n", "    '''\n", "    # Where to save the image\n", "    header_dir = os.path.join(output_dir, 'headers')\n", "\n", "    # Crop the image using header value\n", "    # Numpy slicing - roi = im[y1:y2, x1:x2]\n", "    header_img = img[0:header + 20, 0:w]\n", "\n", "    # Save the cropped image\n", "    cv2.imwrite('{}/{}-header.jpg'.format(header_dir, image_name[:-4]), header_img)\n", "\n", "\n",
"def save_columns(img, columns, header, h, image_name, output_dir):\n", "    '''\n", "    Save each detected column as a separate image.\n", "    Note that the columns list should include the gutter at the beginning and the image width at the end.\n", "    '''\n", "\n", "    # Where to save the images\n", "    col_dir = os.path.join(output_dir, 'columns')\n", "    # Loop through the column values\n", "    for index, column in enumerate(columns):\n", "\n", "        # Get the value of the next column to use as the width of the cropped column\n", "        try:\n", "            next_col = columns[index + 1]\n", "        except IndexError:\n", "\n", "            # If there's no next column we've reached the end of the image, so do nothing\n", "            pass\n", "        else:\n", "\n", "            # Add a little to the margins of the image\n", "            if column > 20:\n", "                this_col = column - 20\n", "            else:\n", "                this_col = column\n", "\n", "            # Crop the image to the dimensions of the column\n", "            # Cast to int because mean() can return floats, and numpy slicing needs integers\n", "            col_img = img[max(0, header - 20):h, int(this_col):int(next_col)]\n", "\n", "            # Save the cropped image, using the index value to denote column order\n", "            cv2.imwrite('{}/{}-col-{}.jpg'.format(col_dir, image_name[:-4], index + 1), col_img)\n", "\n", "\n",
"def display_lines(image_name, output_dir, img, lines):\n", "    '''\n", "    For testing / debugging - shows ALL the detected lines\n", "    '''\n", "    for line in lines:\n", "        # print(line)\n", "        x1, y1, x2, y2 = line[0]\n", "        cv2.line(img, (x1, y1), (x2, y2), (0, 0, 255), 8)\n", "    # cv2.imwrite('{}/{}-lines.jpg'.format(output_dir, image_name[:-4]), img)\n", "\n", "\n",
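"# process_image() below ties the steps together: read and (maybe) deskew the page,\n", "# locate the header with Tesseract, find column lines with OpenCV, then crop or mark up\n",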
"def process_image(image_name, image_path, output_dir='test', markup=False, grid=False, save_derivs=True):\n", "    '''\n", "    Detect columns and header in the supplied image.\n", "\n", "    Parameters:\n", "      image_name\n", "      image_path\n", "      output_dir (must exist)\n", "      markup – if True, draw the results on the image; if False, crop and save the detected regions\n", "      grid – if True, draw a grid on the image\n", "      save_derivs – if True, crop and save the detected columns as separate images\n", "    '''\n", "    img = cv2.imread(image_path)\n", "\n", "    # Get image dimensions\n", "    try:\n", "        h, w = img.shape[:2]\n", "\n", "    # Weed out dodgy images\n", "    except AttributeError:\n", "        print('Not a valid image: {}'.format(image_path))\n", "\n", "    # If it looks ok, then proceed...\n", "    else:\n", "\n", "        # To standardise things a little, we'll resize images with a width greater than 5000\n", "        if w > 5000:\n", "            img = resize(img, h, w)\n", "\n", "            # Get the new dimensions\n", "            h, w = img.shape[:2]\n", "\n", "        # Detect straight lines in the image\n", "        lines = find_lines(img)\n", "\n", "        # Use the detected lines to check for skew\n", "        # I'm not actually sure if these deskewing steps are useful\n", "        angle = check_for_skew(lines)\n", "\n", "        # If image seems to be skewed, then deskew!\n", "        if angle != 0:\n", "            # print('Deskewing')\n", "            img = deskew(img, angle)\n", "\n", "            # Once deskewed we have to redo line detection because positions will have changed\n", "            lines = find_lines(img)\n", "\n", "        # display_lines(image_name, output_dir, img, lines)\n", "\n", "        # Use Tesseract to find the column width and the bottom of the header\n", "        col_width, header = find_header(image_path)\n", "        # print(col_width, header)\n", "\n", "        # Filter the detected lines to identify columns\n", "        gutter, columns = find_columns(lines, h, w, header, col_width)\n", "\n", "        # Draw a grid on image (for debugging)\n", "        if grid:\n", "            img = add_grid(img)\n", "\n", "        # Crop & save columns and header\n", "        if save_derivs:\n", "            # Crop and save header\n", "            # save_header(img, header, w, image_name, output_dir)\n", "\n", "            # Add the gutter to the start of the columns list (find_columns has already added the page width to the end)\n", "            columns = [gutter] + columns\n", "\n", "            # Crop and save columns\n", "            save_columns(img, columns, header, h, image_name, output_dir)\n", "\n", "        # Draw detected column & header lines on image and save the results (for testing)\n", "        if markup:\n", "\n", "            # Draw gutter (cast to int because mean() can return floats, but OpenCV needs integer coordinates)\n", "            cv2.line(img, (int(gutter), 0), (int(gutter), h), (0, 255, 0), 5)\n", "\n", "            # Draw columns\n", "            for column in columns:\n", "                cv2.line(img, (int(column), 0), (int(column), h), (0, 255, 0), 5)\n", "\n", "            # Draw header\n", "            cv2.line(img, (0, header), (w, header), (255, 0, 0), 3)\n", "\n", "            # Save the annotated image\n", "            cv2.imwrite('{}/{}.jpg'.format(output_dir, image_name[:-4]), img)" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Process a single image" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "process_image('testing.jpg', '/Volumes/Sydney Stock Exchange Vol 1/Sydney Stock Exchange 001-109/Transferred AU NBAC N193-077/N193-077_0001.tif', 'testing', markup=True)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "find_header('/Volumes/Sydney Stock Exchange Vol 1/Sydney Stock Exchange 001-109/Transferred AU NBAC N193-001/N193-001_0006.tif')" ] },
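{ "cell_type": "markdown", "metadata": {}, "source": [ "When a page's columns are being missed, it can help to view all of the raw lines detected by `find_lines()` before any filtering. A minimal sketch (the path is a placeholder for any page image you have locally):" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Uncomment and point at a real page image\n", "# img = cv2.imread('samples/page.tif')  # placeholder path\n", "# lines = find_lines(img)\n", "# display_lines('page.tif', 'testing', img, lines)\n", "# cv2.imwrite('testing/page-lines.jpg', img)  # display_lines draws in place, so save to inspect" ] },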
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Process a directory" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Directory to process\n", "dir_path = '/Volumes/Sydney Stock Exchange Vol 1/Sydney Stock Exchange 001-109/Transferred AU NBAC N193-001'\n", "\n", "# This is where the processed images should go\n", "output_dir = 'fullsize-processed/AU NBAC N193-001'\n", "os.makedirs(os.path.join(output_dir, 'columns'), exist_ok=True)\n", "os.makedirs(os.path.join(output_dir, 'headers'), exist_ok=True)\n", "\n", "# Loop through images with .tif(f) extension\n", "for img_name in tqdm([i for i in os.listdir(dir_path) if '.tif' in i[-5:].lower()]):\n", "    # print(img_name)\n", "    img_path = os.path.join(dir_path, img_name)\n", "    try:\n", "        process_image(img_name, img_path, output_dir, markup=True)\n", "    except (TypeError, statistics.StatisticsError):\n", "        print('ERROR: {}'.format(img_name))" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Process lots of directories" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "start_vol = 188\n", "# Directory of directories\n", "input_path = Path('/Volumes/Sydney Stock Exchange Vol 2/Sydney Stock Exchange Vol 2 110-199/')\n", "\n", "# This is where the processed images should go\n", "# output_path = Path('/Volumes/bigdata/mydata/stockexchange/processed')\n", "# output_path = 'fulltext-processed'\n", "# os.makedirs(os.path.join(output_dir, 'columns'), exist_ok=True)\n", "# os.makedirs(os.path.join(output_dir, 'headers'), exist_ok=True)\n", "\n", "# Loop through directories\n", "for img_dir in tqdm([d for d in input_path.glob('*') if d.is_dir()], desc='Directories'):\n", "    # print(img_dir)\n", "    vol_num = int(re.search(r'(\\d+)$', str(img_dir)).group(1))\n", "    if vol_num >= start_vol:\n", "        output_path = Path('/Volumes/bigdata/mydata/stockexchange/processed', img_dir.name.replace('Transferred ', ''))\n", "        Path(output_path, 'columns').mkdir(parents=True, exist_ok=True)\n", "        # Loop through images with .tif(f) extension\n", "        for img_path in tqdm([i for i in img_dir.glob('*') if '.tif' in i.name[-5:].lower()], leave=False, desc='Images'):\n", "            img_name = img_path.name\n", "            # print(str(output_path))\n", "            try:\n", "                # find_lines(img, img_path)\n", "                process_image(str(img_name), str(img_path), str(output_path), markup=True)\n", "            except (TypeError, statistics.StatisticsError):\n", "                pass" ] },
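{ "cell_type": "markdown", "metadata": {}, "source": [ "The loop above silently skips pages that raise errors. A sketch of one way to keep track of them for later reprocessing (the `failed_pages.txt` filename is just an example):" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Uncomment and merge into the inner loop above to log failures instead of discarding them\n", "# with open('failed_pages.txt', 'a') as log:\n", "#     try:\n", "#         process_image(str(img_name), str(img_path), str(output_path), markup=True)\n", "#     except (TypeError, statistics.StatisticsError) as err:\n", "#         log.write('{}\\t{}\\n'.format(img_path, err))" ] },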
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Speed comparison\n", "\n", "For this to work on SWAN you need to comment out the header detection, as it needs a newer version of Tesseract to work properly.\n", "\n", "### Local" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%time\n", "process_image('N193-022_0184.tif', '/Users/tim/Dropbox/working_code/stockexchange/src/notebooks/samples/AU-NBAC-N193-022/N193-022_0184.tif', 'data', markup=True)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%timeit\n", "process_image('N193-022_0184.tif', '/Users/tim/Dropbox/working_code/stockexchange/src/notebooks/samples/AU-NBAC-N193-022/N193-022_0184.tif', 'data', markup=True)" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### SWAN" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%time\n", "process_image('N193-007_0012.tif', '/scratch/cloudstor/Shared/ANU-Library/Sydney Stock Exchange 1901-1950/AU NBAC N193-007/N193-007_0012.tif', 'data', markup=True)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%timeit\n", "process_image('N193-022_0184.tif', '/scratch/cloudstor/Shared/ANU-Library/Sydney Stock Exchange 1901-1950/AU NBAC N193-022/N193-022_0184.tif', 'data', markup=True)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }