{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Test for columns\n",
"\n",
"Test an individual image and display the results"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# If running locally you need to set up a CloudStor (WebDAV) client to download images.\n",
"# DON'T RUN THIS ON SWAN (or you'll get an error because webdav is not installed)\n",
"import webdav.client as wc\n",
"\n",
"# CLOUDSTOR_USER and CLOUDSTOR_PW are stored in a separate credentials file.\n",
"# They are imported by name (rather than with `import *`) so it is obvious where they come from.\n",
"from credentials import CLOUDSTOR_PW, CLOUDSTOR_USER\n",
"\n",
"# Set the connection options.\n",
"options = {\n",
"    'webdav_hostname': 'https://cloudstor.aarnet.edu.au',\n",
"    'webdav_login': CLOUDSTOR_USER,\n",
"    'webdav_password': CLOUDSTOR_PW,\n",
"    'webdav_root': '/plus/remote.php/webdav/'\n",
"}\n",
"\n",
"# Initiate the client; download_image() further down uses it.\n",
"client = wc.Client(options)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: opencv-python in /Users/tim/mycode/stock-exchange/lib/python3.7/site-packages (4.1.0.25)\n",
"Requirement already satisfied: numpy>=1.14.5 in /Users/tim/mycode/stock-exchange/lib/python3.7/site-packages (from opencv-python) (1.16.2)\n",
"\u001b[33mYou are using pip version 19.0.3, however version 19.1.1 is available.\n",
"You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n"
]
}
],
"source": [
"# If on SWAN you might need to run this to install OpenCV (imported as cv2 in the next cell)\n",
"!pip install --user opencv-python "
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# COPIED FROM detect_columns\n",
"# MODIFIED FOR SWAN USE\n",
"\n",
"import numpy as np\n",
"import cv2\n",
"import math\n",
"import statistics\n",
"import os\n",
"import pandas as pd\n",
"import time\n",
"\n",
"# import pytesseract\n",
"from statistics import mean\n",
"import re\n",
"\n",
"# Added\n",
"from IPython.display import display, HTML\n",
"import ipywidgets as widgets\n",
"\n",
"def find_lines(img):\n",
"    \"\"\"Detect long straight line segments in a page image.\n",
"\n",
"    The image is converted to greyscale, binarised with Otsu's method,\n",
"    median-blurred and morphologically opened before Canny edge detection and a\n",
"    probabilistic Hough transform are applied.\n",
"\n",
"    Args:\n",
"        img: BGR image array (process_image passes the result of cv2.imread).\n",
"\n",
"    Returns:\n",
"        The segments found by cv2.HoughLinesP; each entry unpacks to a single\n",
"        (x1, y1, x2, y2) tuple. When nothing is detected an empty array of shape\n",
"        (0, 1, 4) is returned instead of None so callers can always iterate.\n",
"\n",
"    Side effects:\n",
"        Writes the thresholded image to 'data/th.jpg'.\n",
"    \"\"\"\n",
"    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)\n",
"    # THRESH_OTSU picks the threshold itself, so the 125 passed here is not used.\n",
"    _, th = cv2.threshold(gray, 125, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)\n",
"    # Debugging aid: keep a copy of the thresholded image.\n",
"    cv2.imwrite('data/th.jpg', th)\n",
"    kernel = np.ones((5, 5), np.uint8)\n",
"    median = cv2.medianBlur(th, 11)\n",
"    opened = cv2.morphologyEx(median, cv2.MORPH_OPEN, kernel, iterations=1)\n",
"    edges = cv2.Canny(opened, 50, 150, apertureSize=3)\n",
"    lines = cv2.HoughLinesP(image=edges, rho=1, theta=np.pi/180, threshold=200, lines=np.array([]), minLineLength=200, maxLineGap=100)\n",
"    # HoughLinesP gives None (not an empty array) when no segment is found,\n",
"    # which would make the `for line in lines` loops in the callers fail.\n",
"    if lines is None:\n",
"        return np.empty((0, 1, 4), dtype=np.int32)\n",
"    return lines\n",
"\n",
"def find_header(img):\n",
"    \"\"\"Estimate the y position of the table header using OCR.\n",
"\n",
"    The top quarter of the page (minus the right-hand fifth) is run through\n",
"    Tesseract and searched for the words Shares, Quotations, Buyers, Sellers,\n",
"    Business or Done (case-insensitive).\n",
"\n",
"    Args:\n",
"        img: image array; only img.shape[:2] and slicing are used here.\n",
"\n",
"    Returns:\n",
"        The smallest 'top' value among the matching words, or 0 if none match.\n",
"\n",
"    Raises:\n",
"        ImportError: if pytesseract is not installed.\n",
"    \"\"\"\n",
"    # Imported here because the module-level import is commented out, so that\n",
"    # the rest of the notebook still runs where pytesseract is unavailable.\n",
"    import pytesseract\n",
"\n",
"    (h, w) = img.shape[:2]\n",
"    cropped = img[0:round(h/4), 0:round(w-(w/5))]\n",
"    results = pytesseract.image_to_data(cropped, output_type=pytesseract.Output.DICT)\n",
"    header_words = re.compile(r'Shares|Quotations|Buyers|Sellers|Business|Done', flags=re.IGNORECASE)\n",
"    points = [\n",
"        results['top'][index]\n",
"        for index, word in enumerate(results['text'])\n",
"        if header_words.search(word)\n",
"    ]\n",
"    # The highest match on the page wins; 0 means no header word was found.\n",
"    return min(points, default=0)\n",
" \n",
"def check_for_skew(lines):\n",
"    \"\"\"Estimate how far the page is rotated away from vertical, in degrees.\n",
"\n",
"    Only segments spanning more than 200px vertically and with x1 > 300 are\n",
"    used. Each one is turned into an angle measured towards the end with the\n",
"    larger y, and the median of those angles minus 90 is returned, so a page\n",
"    whose long lines are exactly vertical gives 0.\n",
"\n",
"    Args:\n",
"        lines: iterable whose entries each unpack to (x1, y1, x2, y2) tuples,\n",
"            as returned by find_lines. None is treated as no lines.\n",
"\n",
"    Returns:\n",
"        The skew angle in degrees, or 0.0 when no segment qualifies (previously\n",
"        this case raised statistics.StatisticsError).\n",
"    \"\"\"\n",
"    if lines is None:\n",
"        return 0.0\n",
"    angles = []\n",
"    for line in lines:\n",
"        for x1, y1, x2, y2 in line:\n",
"            if abs(y1 - y2) > 200 and x1 > 300:\n",
"                # Orient every segment the same way (dy positive) so that the\n",
"                # order of its end points does not flip the angle by 180 degrees.\n",
"                if y2 > y1:\n",
"                    radians = math.atan2((y2 - y1), (x2 - x1))\n",
"                else:\n",
"                    radians = math.atan2((y1 - y2), (x1 - x2))\n",
"                angles.append(math.degrees(radians))\n",
"    if not angles:\n",
"        # Nothing to measure: report no skew so the caller skips the rotation.\n",
"        return 0.0\n",
"    return statistics.median(angles) - 90\n",
"\n",
"def deskew(img, angle):\n",
"    \"\"\"Rotate img by angle degrees about its centre, keeping the original size.\"\"\"\n",
"    height, width = img.shape[:2]\n",
"    pivot = (width // 2, height // 2)\n",
"    rotation = cv2.getRotationMatrix2D(pivot, angle, 1.0)\n",
"    # Cubic interpolation; areas uncovered by the rotation repeat the border pixels.\n",
"    return cv2.warpAffine(img, rotation, (width, height), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)\n",
"\n",
"def add_grid(img):\n",
"    \"\"\"Draw a 1px grid with 100px spacing onto img (in place) and return it.\"\"\"\n",
"    height, width = img.shape[:2]\n",
"    colour = (255, 0, 0)\n",
"    # Vertical rules first, then horizontal ones.\n",
"    for x in range(0, width, 100):\n",
"        cv2.line(img, (x, 0), (x, height), colour, 1)\n",
"    for y in range(0, height, 100):\n",
"        cv2.line(img, (0, y), (width, y), colour, 1)\n",
"    return img\n",
"\n",
"def find_columns(lines, h, w):\n",
"    \"\"\"Locate the left gutter and the column dividers of a page.\n",
"\n",
"    Args:\n",
"        lines: iterable whose entries each unpack to (x1, y1, x2, y2) tuples,\n",
"            as returned by find_lines. None is treated as no lines.\n",
"        h: image height in pixels.\n",
"        w: image width in pixels.\n",
"\n",
"    Returns:\n",
"        A (gutter, columns) tuple. gutter is the x position taken as the left\n",
"        edge (0 if none was found). columns is a list of integer x positions,\n",
"        in ascending order, one per detected divider.\n",
"    \"\"\"\n",
"    if lines is None:\n",
"        lines = []\n",
"    # Left-most x of every near-vertical segment (less than 10px horizontal\n",
"    # drift) whose upper end is more than 600px above the bottom of the page.\n",
"    x_values = sorted(\n",
"        min(x1, x2)\n",
"        for line in lines\n",
"        for x1, y1, x2, y2 in line\n",
"        if abs(x1 - x2) < 10 and min(y1, y2) < (h - 600)\n",
"    )\n",
"\n",
"    # Group x positions lying within `distance` px of the first member of a group.\n",
"    clusters = []\n",
"    cluster = []\n",
"    start = 0\n",
"    distance = 10\n",
"    for x in x_values:\n",
"        if x < start + distance:\n",
"            cluster.append(x)\n",
"        else:\n",
"            if cluster:\n",
"                clusters.append(cluster)\n",
"            cluster = [x]\n",
"            start = x\n",
"    if cluster:\n",
"        clusters.append(cluster)\n",
"\n",
"    columns = []\n",
"    gutter = 0\n",
"    start = 0\n",
"    for cluster in clusters:\n",
"        first = cluster[0]\n",
"        if first < 600:\n",
"            # A line in the left 600px marks the page edge; keep a 50px margin.\n",
"            gutter = 0 if first < 50 else int(first) - 50\n",
"            start = gutter\n",
"        elif start + 900 < first < (w - 600) and (first - start) < 2000:\n",
"            # Accept a divider only if it is 900-2000px right of the previous\n",
"            # one and not within 600px of the right edge. (The original tried\n",
"            # widths 1100, 1000 and 900 in turn, which reduces to the 900 test.)\n",
"            # Round to int: the result is used as a pixel coordinate for\n",
"            # cv2.line and array slicing, both of which reject floats.\n",
"            columns.append(int(round(mean(cluster))))\n",
"            start = cluster[-1]\n",
"    return (gutter, columns)\n",
"\n",
"def resize(img, h, w):\n",
"    \"\"\"Scale img uniformly by a factor of 5000 / w.\n",
"\n",
"    h is accepted but not used.\n",
"    \"\"\"\n",
"    factor = 5000 / float(w)\n",
"    return cv2.resize(img, None, fx=factor, fy=factor, interpolation=cv2.INTER_AREA)\n",
"\n",
"def save_header(img, header, w, image_name, output_dir):\n",
"    \"\"\"Save the top of the page, down to 20px below the header position.\n",
"\n",
"    The crop is written to <output_dir>/headers/<image stem>-header.jpg.\n",
"\n",
"    Args:\n",
"        img: page image array.\n",
"        header: y position of the header row.\n",
"        w: image width in pixels.\n",
"        image_name: file name of the source image; its extension is dropped.\n",
"        output_dir: directory under which the 'headers' folder lives.\n",
"    \"\"\"\n",
"    header_dir = os.path.join(output_dir, 'headers')\n",
"    # cv2.imwrite does not create missing directories, so make sure it exists.\n",
"    os.makedirs(header_dir, exist_ok=True)\n",
"    # splitext copes with any extension length (the old [:-4] assumed 3 characters).\n",
"    stem = os.path.splitext(image_name)[0]\n",
"    # numpy slicing: roi = im[y1:y2, x1:x2]\n",
"    header_img = img[0:header+20, 0:w]\n",
"    cv2.imwrite('{}/{}-header.jpg'.format(header_dir, stem), header_img)\n",
" \n",
"def save_columns(img, columns, header, h, image_name, output_dir):\n",
"    \"\"\"Crop the page into columns and save each one as a JPEG.\n",
"\n",
"    Every pair of neighbouring positions in `columns` bounds one crop, which is\n",
"    written to <output_dir>/columns/<image stem>-col-<n>.jpg (n starts at 1).\n",
"\n",
"    Args:\n",
"        img: page image array.\n",
"        columns: x positions from left to right, including the outer edges.\n",
"        header: y position of the header row; crops start 20px above it.\n",
"        h: image height in pixels.\n",
"        image_name: file name of the source image; its extension is dropped.\n",
"        output_dir: directory under which the 'columns' folder lives.\n",
"    \"\"\"\n",
"    col_dir = os.path.join(output_dir, 'columns')\n",
"    # cv2.imwrite does not create missing directories, so make sure it exists.\n",
"    os.makedirs(col_dir, exist_ok=True)\n",
"    # splitext copes with any extension length (the old [:-4] assumed 3 characters).\n",
"    stem = os.path.splitext(image_name)[0]\n",
"    # Clamp at 0: a negative start would make numpy slice from the bottom of the page.\n",
"    top = max(header - 20, 0)\n",
"    for index, (column, next_col) in enumerate(zip(columns, columns[1:])):\n",
"        # Widen each crop 20px to the left, unless that would leave the image.\n",
"        this_col = column - 20 if column > 20 else column\n",
"        # Slice bounds must be integers even if the positions arrive as floats.\n",
"        col_img = img[top:h, int(round(this_col)):int(round(next_col))]\n",
"        cv2.imwrite('{}/{}-col-{}.jpg'.format(col_dir, stem, index+1), col_img)\n",
"\n",
"# THIS FUNCTION HAS BEEN MODIFIED\n",
"def process_image(image_name, image_path, output_dir='test', markup=False, grid=False):\n",
"    \"\"\"Detect the columns of one page image, then display or save the result.\n",
"\n",
"    The image is read, scaled down if wider than 5000px, deskewed, and its\n",
"    gutter and column dividers are located.\n",
"\n",
"    Args:\n",
"        image_name: file name used for the files written under output_dir.\n",
"        image_path: path of the image to read.\n",
"        output_dir: directory for the results (created if missing).\n",
"        markup: if True, draw the gutter and column lines on the image, save it\n",
"            as <output_dir>/<image_name> and display it in the notebook. If\n",
"            False, save the header and column crops instead.\n",
"        grid: if True, draw a 100px grid on the image first.\n",
"\n",
"    Notebook globals:\n",
"        If `set_filename` is empty and an `out` Output widget exists, the image\n",
"        is displayed inside that widget; otherwise it is displayed directly.\n",
"    \"\"\"\n",
"    img = cv2.imread(image_path)\n",
"    # This is just to weed out dodgy images: imread gives None rather than raising.\n",
"    if img is None:\n",
"        print('Not a valid image')\n",
"        return\n",
"    h, w = img.shape[:2]\n",
"    if w > 5000:\n",
"        img = resize(img, h, w)\n",
"        h, w = img.shape[:2]\n",
"    lines = find_lines(img)\n",
"    angle = check_for_skew(lines)\n",
"    if angle != 0.0:\n",
"        img = deskew(img, angle)\n",
"        lines = find_lines(img)\n",
"    gutter, columns = find_columns(lines, h, w)\n",
"    # Pixel coordinates must be plain ints for cv2.line and for slicing.\n",
"    gutter = int(gutter)\n",
"    columns = [int(round(column)) for column in columns]\n",
"    # Header detection needs Tesseract 3.05 or greater, SWAN has 3.04\n",
"    # header = find_header(img)\n",
"    if grid:\n",
"        img = add_grid(img)\n",
"    if markup:\n",
"        cv2.line(img, (gutter, 0), (gutter, h), (0, 255, 0), 3)\n",
"        for column in columns:\n",
"            cv2.line(img, (column, 0), (column, h), (0, 0, 255), 3)\n",
"        # cv2.line(img,(0, header),(w, header),(255,0,0),3)\n",
"        # cv2.imwrite does not create missing directories, so make sure it exists.\n",
"        os.makedirs(output_dir, exist_ok=True)\n",
"        # This has been changed to save as a generic name\n",
"        cv2.imwrite('{}/{}'.format(output_dir, image_name), img)\n",
"        # Display as HTML.\n",
"        # NOTE(review): the original markup was lost; an <img> tag is the assumed\n",
"        # form, with time.time() presumably acting as a cache-busting query string.\n",
"        html = HTML('<img src=\"{}/{}?{}\">'.format(output_dir, image_name, time.time()))\n",
"        # `set_filename` and `out` are created in later cells; look them up\n",
"        # defensively so this function also works before those cells have run.\n",
"        output_widget = globals().get('out')\n",
"        if globals().get('set_filename') or output_widget is None:\n",
"            display(html)\n",
"        else:\n",
"            with output_widget:\n",
"                display(html)\n",
"    else:\n",
"        # Header detection is disabled above, so `header` was undefined here.\n",
"        # Use 20, the margin save_header and save_columns apply around the header\n",
"        # position, which makes the column crops start at the top of the page.\n",
"        header = 20\n",
"        save_header(img, header, w, image_name, output_dir)\n",
"        columns = [gutter] + columns + [w]\n",
"        save_columns(img, columns, header, h, image_name, output_dir)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def download_image(image, local_path='test/test.jpg'):\n",
"    \"\"\"Download one image from CloudStor with the WebDAV `client`.\n",
"\n",
"    Args:\n",
"        image: record with a 'path' entry giving the remote path.\n",
"        local_path: where to save the file; defaults to 'test/test.jpg'.\n",
"    \"\"\"\n",
"    # Make sure the target folder exists before the client writes into it.\n",
"    local_dir = os.path.dirname(local_path)\n",
"    if local_dir:\n",
"        os.makedirs(local_dir, exist_ok=True)\n",
"    client.download_sync(remote_path=image['path'], local_path=local_path)\n",
"\n",
"def test_image(b):\n",
"    \"\"\"Pick an image listed in files.csv and run column detection on it.\n",
"\n",
"    The image is the one named by `set_filename`, or else by the `filename`\n",
"    text widget; if both are empty a random row of files.csv is used.\n",
"\n",
"    Args:\n",
"        b: the button when used as a click handler, or None when called\n",
"            directly; if truthy the `out` widget is cleared first.\n",
"    \"\"\"\n",
"    if b:\n",
"        out.clear_output()\n",
"    df = pd.read_csv('files.csv')\n",
"    # `or` short-circuits, so the widget is only read when set_filename is empty.\n",
"    image_name = set_filename or filename.value\n",
"    if not image_name:\n",
"        image = df.sample(1).iloc[0]\n",
"    else:\n",
"        matches = df.loc[df['name'] == image_name]\n",
"        if matches.empty:\n",
"            # Previously this fell through to .iloc[0] and raised a bare IndexError.\n",
"            print('No image named {!r} in files.csv'.format(image_name))\n",
"            return\n",
"        image = matches.iloc[0]\n",
"    print(image)\n",
"    # Use the copy under /webdav if it exists (presumably the SWAN mount);\n",
"    # otherwise download it.\n",
"    image_path = os.path.join(os.sep, 'webdav', image['path'])\n",
"    if not os.path.exists(image_path):\n",
"        download_image(image)\n",
"        image_path = 'test/test.jpg'\n",
"    process_image('test-cols.jpg', image_path, markup=True)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Widgets don't work on SWAN at the moment.\n",
"# So insert an image name below, then run this cell and the next one.\n",
"set_filename = ''"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "92b75595f8f342ec8fa4cefd24f1b5bd",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"VBox(children=(HTML(value='<p>Leave box blank for a random image</p>'), HBox(children=(Text(value='', descript…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"directory AU NBAC N193-102/\n",
"name N193-102_0259.tif\n",
"path Shared/ANU-Library/Sydney Stock Exchange 1901-...\n",
"Name: 32839, dtype: object\n"
]
}
],
"source": [
"# With a file name set, run the detection straight away;\n",
"# otherwise build a small form (text box + button) that runs it on click.\n",
"if set_filename:\n",
"    test_image(None)\n",
"else:\n",
"    out = widgets.Output()\n",
"    filename = widgets.Text(\n",
"        value='',\n",
"        placeholder='Enter image filename',\n",
"        description='Filename:',\n",
"        disabled=False\n",
"    )\n",
"\n",
"    detect = widgets.Button(\n",
"        description='Detect columns',\n",
"        disabled=False,\n",
"        button_style='primary', # 'success', 'info', 'warning', 'danger' or ''\n",
"        tooltip='Click me',\n",
"        icon='check'\n",
"    )\n",
"\n",
"    detect.on_click(test_image)\n",
"\n",
"    display(widgets.VBox([widgets.HTML('<p>Leave box blank for a random image</p>'), widgets.HBox([filename, detect]), out]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}