{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Test for columns\n",
    "\n",
    "Test an individual image and display the results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# If running locally need to set up Cloudstor client to download images\n",
    "# DON'T RUN THIS ON SWAN (or you'll get an error because webdav is not installed)\n",
    "import webdav.client as wc\n",
    "from credentials import * # Storing my CloudStor credentials in another file\n",
    "# Set the connection options. CLOUDSTOR_USER and CLOUDSTOR_PW are stored in a separate credentials file.\n",
    "options = {\n",
    "    'webdav_hostname': 'https://cloudstor.aarnet.edu.au',\n",
    "    'webdav_login':    CLOUDSTOR_USER,\n",
    "    'webdav_password': CLOUDSTOR_PW,\n",
    "    'webdav_root': '/plus/remote.php/webdav/'\n",
    "}\n",
    "# Ok let's initiate the client.\n",
    "client = wc.Client(options)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: opencv-python in /Users/tim/mycode/stock-exchange/lib/python3.7/site-packages (4.1.0.25)\n",
      "Requirement already satisfied: numpy>=1.14.5 in /Users/tim/mycode/stock-exchange/lib/python3.7/site-packages (from opencv-python) (1.16.2)\n",
      "\u001b[33mYou are using pip version 19.0.3, however version 19.1.1 is available.\n",
      "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "# If on SWAN you might need to run this to install OpenCV\n",
    "!pip install --user opencv-python "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# COPIED FROM detect_columns\n",
    "# MODIFIED FOR SWAN USE\n",
    "\n",
    "import numpy as np\n",
    "import cv2\n",
    "import math\n",
    "import statistics\n",
    "import os\n",
    "import pandas as pd\n",
    "import time\n",
    "\n",
    "# import pytesseract\n",
    "from statistics import mean\n",
    "import re\n",
    "\n",
    "# Added\n",
    "from IPython.display import display, HTML\n",
    "import ipywidgets as widgets\n",
    "\n",
    "def find_lines(img):\n",
    "    gray = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)\n",
    "    #th = cv2.adaptiveThreshold(gray,255,cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY,5,2)\n",
    "    retval, th = cv2.threshold(gray,125,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU)\n",
    "    cv2.imwrite('data/th.jpg',th)\n",
    "    kernel = np.ones((5,5),np.uint8)\n",
    "    median = cv2.medianBlur(th, 11)\n",
    "    # cv2.imwrite('data/median.jpg',median)\n",
    "    #eroded = cv2.erode(median, kernel, iterations=1)\n",
    "    #opened = cv2.morphologyEx(median, cv2.MORPH_OPEN, kernel)\n",
    "    #th = cv2.adaptiveThreshold(gray,255,cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY,11,2)\n",
    "    opened = cv2.morphologyEx(median, cv2.MORPH_OPEN, kernel, iterations=1)\n",
    "    # cv2.imwrite('data/opened.jpg',opened)\n",
    "    #v = np.median(median)\n",
    "    #sigma = 0.33\n",
    "    #lower = int(max(0, (1.0 - sigma) * v))\n",
    "    #upper = int(min(255, (1.0 + sigma) * v))\n",
    "    #edges = cv2.Canny(median, lower, upper)\n",
    "    edges = cv2.Canny(opened,50,150,apertureSize=3)\n",
    "    # cv2.imwrite('data/edges.jpg',edges)\n",
    "    lines = cv2.HoughLinesP(image=edges,rho=1,theta=np.pi/180, threshold=200,lines=np.array([]), minLineLength=200,maxLineGap=100)\n",
    "    return lines\n",
    "\n",
    "def find_header(img):\n",
    "    (h, w) = img.shape[:2]\n",
    "    points = []\n",
    "    cropped = img[0:round(h/4), 0:round(w-(w/5))]\n",
    "    results = pytesseract.image_to_data(cropped, output_type=pytesseract.Output.DICT)\n",
    "    for index, word in enumerate(results['text']):\n",
    "        if re.search(r'Shares|Quotations|Buyers|Sellers|Business|Done', word, flags=re.IGNORECASE):\n",
    "            # y = results['top'][index]\n",
    "            points.append(results['top'][index])\n",
    "    #y = round(mean(points))\n",
    "    try:\n",
    "        y = sorted(points)[0]\n",
    "    except IndexError:\n",
    "        y = 0\n",
    "    return y\n",
    "    \n",
    "def check_for_skew(lines):\n",
    "    angles = []\n",
    "    # lines = find_lines(img)\n",
    "    for line in lines:\n",
    "        # print(line)\n",
    "        for x1,y1,x2,y2 in line:\n",
    "            if abs(y1 - y2) > 200 and x1 > 300:\n",
    "                if y2 > y1:\n",
    "                    radians = math.atan2((y2 - y1), (x2 - x1))\n",
    "                else:\n",
    "                    radians = math.atan2((y1 - y2), (x1 - x2))\n",
    "                degrees = math.degrees(radians)\n",
    "                angles.append(degrees)\n",
    "    #print(angles)\n",
    "    # print(statistics.median(angles))\n",
    "    angle = statistics.median(angles) - 90\n",
    "    return angle\n",
    "\n",
    "def deskew(img, angle):\n",
    "    (h, w) = img.shape[:2]\n",
    "    center = (w // 2, h // 2)\n",
    "    M = cv2.getRotationMatrix2D(center, angle, 1.0)\n",
    "    rotated = cv2.warpAffine(img, M, (w, h),flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)\n",
    "    return rotated\n",
    "\n",
    "def add_grid(img):\n",
    "    h, w = img.shape[:2]\n",
    "    for x in range(0, w, 100):\n",
    "        cv2.line(img,(x,0),(x,h),(255,0,0),1) \n",
    "    for y in range(0, h, 100):\n",
    "        cv2.line(img,(0,y),(w,y),(255,0,0),1)\n",
    "    return img\n",
    "\n",
    "def find_columns(lines, h, w):\n",
    "    x_values = []\n",
    "    for line in lines:\n",
    "        for x1,y1,x2,y2 in line:\n",
    "            top = y1 if y1 < y2 else y2\n",
    "            first = x1 if x1 < x2 else x2\n",
    "            if abs(x1 - x2) < 10 and top < (h - 600):\n",
    "                x_values.append(first)\n",
    "    x_values = sorted(x_values)\n",
    "    # print(x_values)\n",
    "    clusters = []\n",
    "    start = 0\n",
    "    distance = 10\n",
    "    cluster = []\n",
    "    for x in x_values:\n",
    "        if x < start + distance:\n",
    "            cluster.append(x)\n",
    "        else:\n",
    "            if cluster:\n",
    "                clusters.append(cluster)\n",
    "            cluster = [x]\n",
    "        start = x\n",
    "    clusters.append(cluster)\n",
    "    columns = []\n",
    "    start = 0\n",
    "    gutter = 0\n",
    "    # print(clusters)\n",
    "    for cluster in clusters:\n",
    "        if cluster and cluster[0] < 600:\n",
    "            if cluster[0] < 50:\n",
    "                gutter = 0\n",
    "            else:\n",
    "                gutter = cluster[0] - 50\n",
    "            start = gutter\n",
    "        else:\n",
    "            for width in reversed(range(900, 1200, 100)):\n",
    "                if cluster and cluster[0] > start + width and cluster[0] < (w - 600) and (cluster[0] - start) < 2000:\n",
    "                    columns.append(mean(cluster))\n",
    "                    start = cluster[-1]\n",
    "                    break\n",
    "    return (gutter, columns)\n",
    "\n",
    "def resize(img, h, w):\n",
    "    scale = 5000 / float(w)\n",
    "    resized = cv2.resize(img, None, fx=scale, fy=scale, interpolation = cv2.INTER_AREA)\n",
    "    return resized\n",
    "\n",
    "def save_header(img, header, w, image_name, output_dir):\n",
    "    # numpy slicing\n",
    "    # roi = im[y1:y2, x1:x2]\n",
    "    header_dir = os.path.join(output_dir, 'headers')\n",
    "    header_img = img[0:header+20, 0:w]\n",
    "    cv2.imwrite('{}/{}-header.jpg'.format(header_dir, image_name[:-4]), header_img)\n",
    "    \n",
    "def save_columns(img, columns, header, h, image_name, output_dir):\n",
    "    col_dir = os.path.join(output_dir, 'columns')\n",
    "    for index, column in enumerate(columns):\n",
    "        try:\n",
    "            next_col = columns[index+1]\n",
    "        except IndexError:\n",
    "            pass\n",
    "        else:\n",
    "            if column > 20:\n",
    "                this_col = column - 20\n",
    "            else:\n",
    "                this_col = column\n",
    "            col_img = img[header-20:h, this_col:next_col]\n",
    "            cv2.imwrite('{}/{}-col-{}.jpg'.format(col_dir, image_name[:-4], index+1), col_img)\n",
    "\n",
    "# THIS FUNCTION HAS BEEN MODIFIED\n",
    "def process_image(image_name, image_path, output_dir='test', markup=False, grid=False):\n",
    "    img = cv2.imread(image_path)\n",
    "    # This is just to weed out dodgy images\n",
    "    try:\n",
    "        h, w = img.shape[:2]\n",
    "    except AttributeError:\n",
    "        print('Not a valid image')\n",
    "    else:\n",
    "        if w > 5000:\n",
    "            img = resize(img, h, w)\n",
    "            h, w = img.shape[:2]\n",
    "        lines = find_lines(img)\n",
    "        angle = check_for_skew(lines)\n",
    "        if angle != 0.0:\n",
    "            img = deskew(img, angle)\n",
    "            lines = find_lines(img)\n",
    "        gutter, columns = find_columns(lines, h, w)\n",
    "        # Header detection needs Tesseract 3.05 or greater, SWAN has 3.04\n",
    "        # header = find_header(img)\n",
    "        if grid:\n",
    "            img = add_grid(img)\n",
    "        if markup:\n",
    "            cv2.line(img,(gutter,0),(gutter,h),(0,255,0),3)\n",
    "            for column in columns:\n",
    "                cv2.line(img,(column,0),(column,h),(0,0,255),3)\n",
    "            # cv2.line(img,(0, header),(w, header),(255,0,0),3)\n",
    "            # This has been changed to save as a generic name\n",
    "            cv2.imwrite('{}/{}'.format(output_dir, image_name), img)\n",
    "            # Display as HTML\n",
    "            if set_filename:\n",
    "                display(HTML('<img src=\"{}/{}?{}\">'.format(output_dir, image_name, time.time())))\n",
    "            else:\n",
    "                with out:\n",
    "                    display(HTML('<img src=\"{}/{}?{}\">'.format(output_dir, image_name, time.time())))\n",
    "        else:\n",
    "            save_header(img, header, w, image_name, output_dir)\n",
    "            columns = [gutter] + columns + [w] \n",
    "            save_columns(img, columns, header, h, image_name, output_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def download_image(image):\n",
    "    client.download_sync(remote_path=image['path'], local_path='test/test.jpg')  \n",
    "\n",
    "def test_image(b):\n",
    "    if b:\n",
    "        out.clear_output()\n",
    "    df = pd.read_csv('files.csv')\n",
    "    if set_filename:\n",
    "        image_name = set_filename\n",
    "    else:\n",
    "        image_name = filename.value\n",
    "    if not image_name:\n",
    "        image = df.sample(1).iloc[0]\n",
    "    else:\n",
    "        image = df.loc[df['name'] == image_name].iloc[0]\n",
    "    print(image)\n",
    "    image_path = os.path.join(os.sep, 'webdav', image['path'])\n",
    "    if not os.path.exists(image_path):\n",
    "        download_image(image)\n",
    "        image_path = 'test/test.jpg'\n",
    "    process_image('test-cols.jpg', image_path, markup=True)     "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Widgets don't work on SWAN at the moment\n",
    "# So insert and image name below and then run the this and the next cell\n",
    "set_filename = ''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "92b75595f8f342ec8fa4cefd24f1b5bd",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "VBox(children=(HTML(value='<p>Leave box blank for a random image</p>'), HBox(children=(Text(value='', descript…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "directory                                    AU NBAC N193-102/\n",
      "name                                         N193-102_0259.tif\n",
      "path         Shared/ANU-Library/Sydney Stock Exchange 1901-...\n",
      "Name: 32839, dtype: object\n"
     ]
    }
   ],
   "source": [
    "if set_filename:\n",
    "    test_image(None)\n",
    "else:\n",
    "    out = widgets.Output()\n",
    "    filename = widgets.Text(\n",
    "            value='',\n",
    "            placeholder='Enter image filename',\n",
    "            description='Filename:',\n",
    "            disabled=False\n",
    "        )\n",
    "\n",
    "    detect = widgets.Button(\n",
    "        description='Detect columns',\n",
    "        disabled=False,\n",
    "        button_style='primary', # 'success', 'info', 'warning', 'danger' or ''\n",
    "        tooltip='Click me',\n",
    "        icon='check'\n",
    "    )\n",
    "\n",
    "    detect.on_click(test_image)\n",
    "\n",
    "    display(widgets.VBox([widgets.HTML('<p>Leave box blank for a random image</p>'), widgets.HBox([filename, detect]), out]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}