{ "cells": [ { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: opencv-python in /scratch/.local/lib/python3.6/site-packages (4.1.1.26)\n", "Requirement already satisfied: numpy>=1.11.3 in /cvmfs/sft.cern.ch/lcg/views/LCG_96python3/x86_64-centos7-gcc8-opt/lib/python3.6/site-packages (from opencv-python) (1.16.4)\n", "Requirement already satisfied: pytesseract in /scratch/.local/lib/python3.6/site-packages (0.3.0)\n", "Requirement already satisfied: Pillow in /cvmfs/sft.cern.ch/lcg/views/LCG_96python3/x86_64-centos7-gcc8-opt/lib/python3.6/site-packages (from pytesseract) (6.0.0)\n", "Collecting tqdm\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/dc/88/d3213e2f3492daf09d8b41631ad6899f56db17ce83ea9c8a579902bafe5e/tqdm-4.35.0-py2.py3-none-any.whl (50kB)\n", "\u001b[K |████████████████████████████████| 51kB 7.1MB/s eta 0:00:011\n", "\u001b[?25hInstalling collected packages: tqdm\n", "Successfully installed tqdm-4.35.0\n" ] } ], "source": [ "import sys\n", "!{sys.executable} -m pip install --user opencv-python\n", "!{sys.executable} -m pip install --user pytesseract\n", "!{sys.executable} -m pip install --user tqdm" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import cv2\n", "import math\n", "import statistics\n", "import os\n", "try:\n", " from PIL import Image\n", "except ImportError:\n", " import Image\n", "import pytesseract\n", "from tqdm.auto import tqdm\n", "from statistics import mean\n", "import re" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "def find_lines(img):\n", " '''\n", " Find straight lines in an image.\n", " Returns a list of lines.\n", " \n", " These settings have been arrived at after much trial and error.\n", " '''\n", " # Convert to grayscale\n", " gray = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)\n", " \n", " # Theshold image (convert to black and white)\n", " retval, th = cv2.threshold(gray,125,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU)\n", " # cv2.imwrite('data/th.jpg',th)\n", " \n", " # Use median blur to get rid of a lot of the text\n", " median = cv2.medianBlur(th, 11)\n", " # cv2.imwrite('data/median.jpg',median)\n", " \n", " # Try to strengthen the remaining lines\n", " kernel = np.ones((5,5),np.uint8)\n", " opened = cv2.morphologyEx(median, cv2.MORPH_OPEN, kernel, iterations=1)\n", " # cv2.imwrite('data/opened.jpg',opened)\n", "\n", " # Find the edges of the remaining shapes\n", " edges = cv2.Canny(opened,50,150,apertureSize=3)\n", " # cv2.imwrite('data/edges.jpg',edges)\n", " \n", " # Find straight lines in the edges\n", " # Note that the minLineLength and maxLineGap values can have a dramatic effect on the number of lines detected.\n", " # Note too that what looks to the human eye as a single straight line,\n", " # can actually be a series of short line with tiny gaps between them,\n", " # so while increasing the minLineLength reduces noise from text, it can also filter out columns.\n", " lines = cv2.HoughLinesP(image=edges,rho=1,theta=np.pi/180, threshold=200,lines=np.array([]), minLineLength=100,maxLineGap=100)\n", " return lines\n", "\n", "\n", "def find_header(img):\n", " '''\n", " Locate the page header, and return the y position of the bottom.\n", " Note that in this case using OCR seemed easier than trying to sort/cluster lines.\n", " '''\n", " \n", " # Image dimensions\n", " (h, w) = img.shape[:2]\n", " points = 
[]\n", " \n", " # The header will always be at the top, so crop off the top of the image, rather than OCRing the whole thing\n", " cropped = img[0:round(h/4), 0:round(w-(w/4))]\n", " \n", " # I should probably do some image preprocessing here (see row detection notebook)\n", " # Get OCR results\n", " results = pytesseract.image_to_data(cropped, output_type=pytesseract.Output.DICT)\n", " \n", " # Loop through the OCR results looking for specific words that appear in the header\n", " for index, word in enumerate(results['text']):\n", " # These are the words that appear in the header\n", " # If we find one, add its 'top' value to our list of points\n", " if re.search(r'Shares|Quotations|Buyers|Sellers|Business|Done|Closing prices|Sales', word, flags=re.IGNORECASE) and results['height'][index] < 50:\n", " # y = results['top'][index]\n", " points.append(results['top'][index])\n", " # y = round(mean(points))\n", " # print(points)\n", " \n", " try:\n", " # Sort points and get the first\n", " y = sorted(points)[0]\n", " except IndexError:\n", " # If we can't find a header return 0\n", " y = 0\n", " \n", " # Return y value of header\n", " return y\n", " \n", "\n", "def check_for_skew(lines):\n", " '''\n", " Check for skewing by looking at the near vertical lines detected in the image.\n", " '''\n", " angles = []\n", " \n", " # Loop through detected lines\n", " for line in lines:\n", " # Get coords of line\n", " for x1,y1,x2,y2 in line:\n", " # Ignore short lines and lines in header\n", " if abs(y1 - y2) > 150 and x1 > 300:\n", " # Get the angle of the line\n", " if y2 > y1:\n", " radians = math.atan2((y2 - y1), (x2 - x1))\n", " else:\n", " radians = math.atan2((y1 - y2), (x1 - x2))\n", " degrees = math.degrees(radians)\n", " # print(degrees)\n", " # If it's vertical-ish, save this angle\n", " if degrees >= 80 and degrees <= 100:\n", " angles.append(degrees)\n", " # Get the media of the saved angles\n", " angle = statistics.median(angles) - 90\n", " # print(angle)\n", " return angle\n", "\n", "\n", "def deskew(img, angle):\n", " '''\n", " Deskew image by rotating it by the supplied angle.\n", " '''\n", " # Get image dimensions\n", " (h, w) = img.shape[:2]\n", " \n", " # Get the centre of the image\n", " center = (w // 2, h // 2)\n", " \n", " # Rotate image by angle\n", " M = cv2.getRotationMatrix2D(center, angle, 1.0)\n", " rotated = cv2.warpAffine(img, M, (w, h),flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)\n", " \n", " # Return the rotated image\n", " return rotated\n", "\n", "\n", "def add_grid(img):\n", " '''\n", " Draws a 100 x 100px grid on image.\n", " Can be useful in interpreting column detection results.\n", " '''\n", " h, w = img.shape[:2]\n", " for x in range(0, w, 100):\n", " cv2.line(img,(x,0),(x,h),(255,0,0),1) \n", " for y in range(0, h, 100):\n", " cv2.line(img,(0,y),(w,y),(255,0,0),1)\n", " return img\n", "\n", "\n", "def find_top(lines):\n", " '''\n", " Use horizontal lines near the top of the page to provide an approximation of the header height.\n", " Used to crop page to ignore lines in header area.\n", " More accurate header location is found using Tesseract.\n", " '''\n", " top = 0\n", " y_values = []\n", " for line in lines:\n", " for x1,y1,x2,y2 in line:\n", " bottom = y1 if y1 > y2 else y2\n", " if bottom < 1000:\n", " radians = math.atan2((y1 - y2), (x1 - x2))\n", " degrees = math.degrees(radians)\n", " if degrees > 179 and degrees < 181:\n", " y_values.append(bottom)\n", " if y_values:\n", " top = sorted(y_values)[-1]\n", " return top \n", "\n", "\n", "def 
find_columns(lines, h, w):\n", " '''\n", " Identifies most likely column values from within the set of straight lines in an image.\n", " This could do with some cleaning up, but it's working well at the moment, so I don't really want to fiddle any more.\n", " Note that this does depend on some knowledge of the images to define ranges of expected values.\n", " '''\n", " x_values = []\n", " \n", " # Get the approximate position of the header so we can ignore lines above this\n", " column_top = find_top(lines)\n", "\n", " # Find the x values of vertical lines\n", " for line in lines:\n", " for x1,y1,x2,y2 in line:\n", " \n", " # Find the top\n", " top = y1 if y1 < y2 else y2\n", " \n", " # Ignore column lines at the top & bottom of the image\n", " if top > column_top and top < (h - 600):\n", "\n", " # Find the leftmost point\n", " first = x1 if x1 < x2 else x2\n", " \n", " # Find the angle of the line\n", " radians = math.atan2((y1 - y2), (x1 - x2))\n", " degrees = abs(math.degrees(radians))\n", " \n", " # If the line is (close to) vertical, we'll save the left-most x value\n", " if degrees >= 89 and degrees <= 91:\n", " x_values.append(first)\n", " \n", " # Sort the x_values\n", " x_values = sorted(x_values)\n", " \n", " # Cluster together values within the specified distance\n", " clusters = []\n", " start = 0\n", " # Lines less than this distance apart will be clustered\n", " distance = 10\n", " cluster = []\n", " \n", " # Loop through x values\n", " for x in x_values:\n", " \n", " # If the x value is less than the specified distance from the previous point,\n", " # we'll add it to the current cluster\n", " if x < start + distance:\n", " cluster.append(x)\n", " \n", " # If not we'll save the current cluster, and start a new one\n", " else:\n", " if cluster:\n", " # Add the current cluster to the list of clusters\n", " clusters.append(cluster)\n", " \n", " # Start a new cluster at the current point\n", " cluster = [x]\n", " \n", " # Set the current position\n", " start = x\n", " \n", " # Add the last cluster once we've finished the loop\n", " clusters.append(cluster)\n", "\n", " # Now we have a list of clustered x values\n", " # We'll compare nearby clusters and keep the ones with the most values (most likely to be columns)\n", " best_clusters = [[0]]\n", " \n", " # Loop through clusters\n", " for cluster in clusters:\n", " \n", " # If the current cluster is within 200px of the previous one\n", " if cluster[0] < best_clusters[-1][-1] + 200:\n", " \n", " # Check to see which cluster contains the most values\n", " # If it's the current one we'll add it to our best clusters\n", " if len(cluster) > len(best_clusters[-1]):\n", " \n", " # Remove the previous cluster from best clusters\n", " best_clusters.pop()\n", " \n", " # Add this one\n", " best_clusters.append(cluster)\n", " \n", " # If this cluster isn't near the previous one, add it to best clusters\n", " else:\n", " best_clusters.append(cluster)\n", " \n", " # Now we have our best candidates for columns in best clusters\n", " # We'll do some further filtering by checking the clusters against our expectations of column positions\n", " # The pixel values used below are based on trial and error with the Stock Exchange images\n", " # Obviously if you were using this on other images you'd want to adjust them accordingly\n", " \n", " columns = []\n", " start = 0\n", " gutter = 0\n", " gap = None\n", " \n", " # Loop through our best clusters\n", " for cluster in best_clusters:\n", " \n", " # If the leftmost point in this cluster is less than 600 
then it's the gutter\n", "        if cluster and cluster[0] < 600:\n", "            \n", "            # Set the gutter value to a mean of the clustered points\n", "            gutter = mean(cluster)\n", "            \n", "            # Sometimes the gutter isn't detected, so we'll set a reasonable start position\n", "            if gutter < 100:\n", "                start = 100\n", "            else:\n", "                start = gutter\n", "        else:\n", "            # Checking the gap between this cluster and the previous one\n", "            if gap:\n", "                this_gap = gap\n", "            else:\n", "                \n", "                # Current gap is the leftmost point of this cluster minus the previous column position\n", "                this_gap = cluster[0] - start\n", "            \n", "            # This range represents approximate min/max column widths\n", "            # We'll look for columns at 100 px intervals starting from the max value until we hit the min value\n", "            for width in reversed(range(900, 2000, 100)):\n", "                \n", "                # Try to make sure columns are roughly the same width\n", "                if (cluster[0] - start) > (this_gap - 600) and (cluster[0] - start) < (this_gap + 600):\n", "                    \n", "                    # If cluster falls within expected values, we'll assume it's a column\n", "                    if cluster and cluster[0] > (start + width) and cluster[0] < (w - 600) and this_gap < 2600:\n", "                        \n", "                        # Save mean of clustered values as column\n", "                        columns.append(mean(cluster))\n", "                        \n", "                        # Set the next start value to value of the last point in cluster\n", "                        start = cluster[-1]\n", "                        gap = this_gap\n", "                        \n", "                        # Don't look for any more columns in this cluster\n", "                        break\n", "    #print(gutter)\n", "    #print(columns)\n", "    return (gutter, columns)\n", "\n", "\n", "def resize(img, h, w):\n", "    '''\n", "    Resize image to a max width of 5000 px.\n", "    '''\n", "    # Find the scale to use, based on max width\n", "    scale = 5000 / float(w)\n", "    \n", "    # Resize the image\n", "    resized = cv2.resize(img, None, fx=scale, fy=scale, interpolation = cv2.INTER_AREA)\n", "    return resized\n", "\n", "\n", "def save_header(img, header, w, image_name, output_dir):\n", "    '''\n", "    Save the detected header as a separate image.\n", "    '''\n", "    \n", "    # Crop the image using header value\n", "    # Numpy slicing - roi = im[y1:y2, x1:x2]\n", "    header_img = img[0:header+20, 0:w]\n", "    \n", "    # Find the scale to use, based on max width\n", "    scale = 2000 / float(w)\n", "    \n", "    # Resize the image\n", "    resized = cv2.resize(header_img, None, fx=scale, fy=scale, interpolation = cv2.INTER_AREA)\n", "    \n", "    # Save the cropped image\n", "    cv2.imwrite('{}/{}-header.jpg'.format(output_dir, image_name[:-4]), resized)\n", "\n", "    \n", "def save_columns(img, columns, header, h, image_name, output_dir):\n", "    '''\n", "    Save each detected column as a separate image.\n", "    Note that the columns list should include the gutter at the beginning and the image width at the end.\n", "    '''\n", "    \n", "    # Where to save the images\n", "    col_dir = os.path.join(output_dir, 'columns')\n", "    \n", "    # Loop through the column values\n", "    for index, column in enumerate(columns):\n", "        \n", "        # Get the value of the next column to use as the width of the cropped column\n", "        try:\n", "            next_col = columns[index+1]\n", "        except IndexError:\n", "            \n", "            # If there's no next column we've reached the end of the image, so do nothing\n", "            pass\n", "        else:\n", "            \n", "            # Add a little to the margins of the image\n", "            if column > 20:\n", "                this_col = column - 20\n", "            else:\n", "                this_col = column\n", "            \n", "            # Crop the image to the dimensions of the column\n", "            col_img = img[header-20:h, this_col:next_col]\n", "            \n", "            # Save the cropped image, using the index value to denote column order\n", "            cv2.imwrite('{}/{}-col-{}.jpg'.format(col_dir, image_name[:-4], index+1), col_img)\n", "    \n", "    \n", "def display_lines(image_name, output_dir, img, lines):\n", "    '''\n", "    For testing / debugging - shows ALL the detected lines\n", "    '''\n", "    for line in lines:\n", "        #print(line)\n", "        x1,y1,x2,y2 = line[0]\n", "        cv2.line(img,(x1,y1),(x2,y2),(0,0,255),8)\n", "    #cv2.imwrite('{}/{}-lines.jpg'.format(output_dir, image_name[:-4]), img)\n", "    \n", "\n", "def process_image(image_name, image_path, output_dir='test', markup=False, grid=False):\n", "    '''\n", "    Detect columns and header in the supplied image.\n", "    \n", "    Parameters:\n", "        image_name\n", "        image_path\n", "        output_dir (must exist)\n", "        markup – if True, draw the results on the image; if False, crop and save the detected regions.\n", "        grid – if True, draw a grid on the image\n", "    '''\n", "    \n", "    img = cv2.imread(image_path)\n", "    \n", "    # Get image dimensions\n", "    try:\n", "        h, w = img.shape[:2]\n", "    \n", "    # Weed out dodgy images\n", "    except AttributeError:\n", "        print('Not a valid image: {}'.format(image_path))\n", "    \n", "    # If it looks ok, then proceed...\n", "    else:\n", "        \n", "        # To standardise things a little, we'll resize images with a width greater than 5000\n", "        if w > 5000:\n", "            img = resize(img, h, w)\n", "            \n", "            # Get the new dimensions\n", "            h, w = img.shape[:2]\n", "        \n", "        # Detect straight lines in the image\n", "        lines = find_lines(img)\n", "        \n", "        # Use the detected lines to check for skew\n", "        # I'm not actually sure if these deskewing steps are useful\n", "        angle = check_for_skew(lines)\n", "        \n", "        # If image seems to be skewed, then deskew!\n", "        if angle != 0:\n", "            # print('Deskewing')\n", "            img = deskew(img, angle)\n", "            \n", "            # Once deskewed we have to redo line detection because positions will have changed\n", "            lines = find_lines(img)\n", "        \n", "        #display_lines(image_name, output_dir, img, lines)\n", "        \n", "        # Filter the detected lines to identify columns\n", "        # gutter, columns = find_columns(lines, h, w)\n", "        \n", "        # Find the bottom of the header\n", "        header = find_header(img)\n", "        \n", "        # Crop and save the detected header\n", "        save_header(img, header, w, image_name, output_dir)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "079c640cfcd14dfa8188f9b4554b7033", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Directories', max=199, style=ProgressStyle(description_width=…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "a4d7ead9f3054240b20de955a2b08e1d", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=1, bar_style='info', description='Images', max=1, style=ProgressStyle(descrip…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "e85fb90228034a03a67a0e5cbe2022e5", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=303, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "a2ca46905b4f42c8b8ae7ff56e4f3ef5", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=312, 
style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "2584582ea7e8442bb2bdecb602fd1718", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=345, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "946c7b910f5947128c0eb0ebffa2260f", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=312, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "ab4e2fc6867846b391984b9c96f02e8a", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=305, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "b96ca3718b2b4832970af287785c1dc9", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=334, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "9efd5d19472e42169ebc160c224058f4", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=350, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Not a valid image: /scratch/cloudstor/Shared/ANU-Library/Sydney Stock Exchange 1901-1950/AU NBAC N193-007/.sys.v#.N193-007_0012.tif\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "e2863f3bf59f499586c59660eb2dff53", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=318, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "d466e85f8b0e4ff5bd720c758e7dedbd", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=327, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "3e08c973a06247039f8536a1041290df", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=327, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "7b01c3e4a8614d9080a6d4a658687b06", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=350, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "187e92e9d1ed4c38a80a363c3006f54f", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=310, 
style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "e14c199e42f146119a3014bf7dabdc47", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=330, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "eff2ccf7c4a74adaa9d1ea246d99eea1", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=349, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "7858ba74e48e41ac8120a03054e32fb0", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=313, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "fa8d3088981e476ca24904c7c5255fe6", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=331, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "af49a9cbf99c422f80aa3be151befde3", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=322, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "755f7978517d44bf9001d731d934cd15", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=348, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "4714b8d069554553be3678826d1a02fc", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=312, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "8598e1e072e64dc18e63ff27766eac87", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=330, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "c9aee6603511441fa8b550877418bbf0", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=314, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "89452a33a92a402184350e5bef0a8fdd", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=344, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "7222b5aec86546c5aefe292b48c0fc5c", "version_major": 2, 
"version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=310, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "6f5b7cf7b83f4cc1a578f3bcbc8a6039", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=323, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "8879f9b7213c43bd826d7a6c5ffe7dbb", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=332, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "77a0ef16ffa24495a65000cff35af918", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=349, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "c10317a7ac154c53a22fb79c17a39e75", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=314, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "335f8f7cbe4a479e99e1a4af8dc544f3", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=328, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "8037e129ef144944851ae36c8e15b39b", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=327, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "357f19d19b9e4db991bf0eba99359d82", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=339, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "9145ba2b18844b328ae62c6847c54fa3", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=316, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "6d235b1054d14525baf1d4fdd1f328d9", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=329, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Directory of directories\n", "dir_path = '/scratch/cloudstor/Shared/ANU-Library/Sydney Stock Exchange 1901-1950'\n", "\n", "# This is where the processed images should go\n", "# output_dir = 'processed/samples'\n", "# os.makedirs(os.path.join(output_dir, 'columns'), exist_ok=True)\n", "os.makedirs(os.path.join('all_headers'), 
exist_ok=True)\n", "\n", "# Loop through directories\n", "for img_dir in tqdm([d for d in os.listdir(dir_path) if os.path.isdir(os.path.join(dir_path, d)) and d not in ['AU NBAC N193-014']], desc='Directories'):\n", "    output_dir = os.path.join('all_headers', img_dir)\n", "    os.makedirs(output_dir, exist_ok=True)\n", "    # Loop through images with .tif(f) extension\n", "    for img_name in tqdm([i for i in os.listdir(os.path.join(dir_path, img_dir)) if '.tif' in i[-5:].lower()], leave=False, desc='Images'):\n", "        # print(img_name)\n", "        img_path = os.path.join(dir_path, img_dir, img_name)\n", "        output_img = os.path.join(output_dir, '{}-header.jpg'.format(img_name[:-4]))\n", "        if not os.path.exists(output_img):\n", "            try:\n", "                #find_lines(img, img_path)\n", "                process_image(img_name, img_path, output_dir, markup=False)\n", "            except (TypeError, statistics.StatisticsError):\n", "                raise" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# Directory of directories\n", "dir_path = '/scratch/cloudstor/Shared/ANU-Library/Sydney Stock Exchange 1901-1950'\n", "\n", "def process_dir(img_dir):\n", "    output_dir = os.path.join('all_headers', img_dir)\n", "    os.makedirs(output_dir, exist_ok=True)\n", "    # Loop through images with .tif(f) extension\n", "    for img_name in tqdm([i for i in os.listdir(os.path.join(dir_path, img_dir)) if '.tif' in i[-5:].lower()], leave=False, desc='Images'):\n", "        # print(img_name)\n", "        img_path = os.path.join(dir_path, img_dir, img_name)\n", "        output_img = os.path.join(output_dir, '{}-header.jpg'.format(img_name[:-4]))\n", "        if not os.path.exists(output_img):\n", "            try:\n", "                #find_lines(img, img_path)\n", "                process_image(img_name, img_path, output_dir, markup=False)\n", "            except (TypeError, statistics.StatisticsError):\n", "                raise" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "79ce437cbaec4a90b3565f8c43a6beba", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=390, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\r" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "ea7c3639f86b4920a1711b7a9d8ba722", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Images', max=605, style=ProgressStyle(description_width='init…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "for img_dir in ['AU NBAC N193-135', 'AU NBAC N193-146', 'AU NBAC N193-165', 'AU NBAC N193-191']:\n", "    process_dir(img_dir)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 4 }