{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Find columns and headers\n", "\n", "Process Stock Exchange images, detecting the positions of columns and headers." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import cv2\n", "import math\n", "import statistics\n", "import os\n", "try:\n", " from PIL import Image\n", "except ImportError:\n", " import Image\n", "import pytesseract\n", "from tqdm.auto import tqdm\n", "from statistics import mean\n", "import re\n", "from fuzzywuzzy import fuzz\n", "import tempfile\n", "from pathlib import Path" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# These OCR image preprocessing steps are based on https://stackoverflow.com/a/43493383\n", "# I don't really understand why this particular combination of filters works, but it does seem to improve OCR results\n", "\n", "BINARY_THRESHOLD = 200\n", "\n", "def process_image_for_ocr(file_path):\n", " # TODO : Implement using opencv\n", " temp_filename = set_image_dpi(file_path)\n", " im_new = remove_noise_and_smooth(temp_filename)\n", " return im_new\n", "\n", "\n", "def set_image_dpi(file_path):\n", " im = Image.open(file_path)\n", " length_x, width_y = im.size\n", " factor = max(1, int(5000 / length_x))\n", " size = factor * length_x, factor * width_y\n", " # size = (1800, 1800)\n", " im_resized = im.resize(size, Image.ANTIALIAS)\n", " temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.jpg')\n", " temp_filename = temp_file.name\n", " im.save(temp_filename, dpi=(300, 300))\n", " return temp_filename\n", "\n", "\n", "def image_smoothening(img):\n", " ret1, th1 = cv2.threshold(img, BINARY_THRESHOLD, 255, cv2.THRESH_BINARY)\n", " ret2, th2 = cv2.threshold(th1, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)\n", " blur = cv2.GaussianBlur(th2, (1, 1), 0)\n", " ret3, th3 = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)\n", " return th3\n", "\n", "\n", "def remove_noise_and_smooth(file_name):\n", " img = cv2.imread(file_name, 0)\n", " # gray = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)\n", " filtered = cv2.adaptiveThreshold(img.astype(np.uint8), 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 41, 3)\n", " kernel = np.ones((1, 1), np.uint8)\n", " opening = cv2.morphologyEx(filtered, cv2.MORPH_OPEN, kernel)\n", " closing = cv2.morphologyEx(opening, cv2.MORPH_CLOSE, kernel)\n", " img = image_smoothening(img)\n", " or_image = cv2.bitwise_or(img, closing)\n", " (h, w) = or_image.shape[:2]\n", " img = resize(or_image, h, w)\n", " return img" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def find_lines(img):\n", " '''\n", " Find straight lines in an image.\n", " Returns a list of lines.\n", " \n", " These settings have been arrived at after much trial and error.\n", " '''\n", " # Convert to grayscale\n", " gray = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)\n", " \n", " # Theshold image (convert to black and white)\n", " retval, th = cv2.threshold(gray,125,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU)\n", " # cv2.imwrite('data/th.jpg',th)\n", " \n", " # Use median blur to get rid of a lot of the text\n", " median = cv2.medianBlur(th, 11)\n", " # cv2.imwrite('data/median.jpg',median)\n", " \n", " # Try to strengthen the remaining lines\n", " kernel = np.ones((5,5),np.uint8)\n", " opened = cv2.morphologyEx(median, cv2.MORPH_OPEN, kernel, iterations=1)\n", " # cv2.imwrite('data/opened.jpg',opened)\n", "\n", " # Find the 
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def find_lines(img):\n", "    '''\n", "    Find straight lines in an image.\n", "    Returns a list of lines.\n", "\n", "    These settings have been arrived at after much trial and error.\n", "    '''\n", "    # Convert to grayscale\n", "    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)\n", "\n", "    # Threshold image (convert to black and white)\n", "    retval, th = cv2.threshold(gray, 125, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)\n", "    # cv2.imwrite('data/th.jpg', th)\n", "\n", "    # Use median blur to get rid of a lot of the text\n", "    median = cv2.medianBlur(th, 11)\n", "    # cv2.imwrite('data/median.jpg', median)\n", "\n", "    # Try to strengthen the remaining lines\n", "    kernel = np.ones((5, 5), np.uint8)\n", "    opened = cv2.morphologyEx(median, cv2.MORPH_OPEN, kernel, iterations=1)\n", "    # cv2.imwrite('data/opened.jpg', opened)\n", "\n", "    # Find the edges of the remaining shapes\n", "    v = np.median(gray)\n", "    lower = int(max(0, (1.0 - 0.33) * v))\n", "    upper = int(min(255, (1.0 + 0.33) * v))\n", "    # edges = cv2.Canny(opened, 50, 150, apertureSize=3)\n", "    edges = cv2.Canny(opened, lower, upper, apertureSize=3)\n", "    # cv2.imwrite('data/edges.jpg', edges)\n", "\n", "    # Find straight lines in the edges\n", "    # Note that the minLineLength and maxLineGap values can have a dramatic effect on the number of lines detected.\n", "    # Note too that what looks to the human eye like a single straight line\n", "    # can actually be a series of short lines with tiny gaps between them,\n", "    # so while increasing the minLineLength reduces noise from text, it can also filter out columns.\n", "    lines = cv2.HoughLinesP(image=edges, rho=1, theta=np.pi/180, threshold=200, lines=np.array([]), minLineLength=100, maxLineGap=100)\n", "    # print(lines)\n", "    return lines\n", "\n",
"def find_margin(df):\n", "    return int(round(df.loc[(df['level'] == 4) & (df['left'] < 150)]['left'].mean()))\n", "\n",
"def find_col_width(df):\n", "    candidates = []\n", "    for confidence in reversed(range(80, 110, 10)):\n", "        for heading in ['buyers', 'closing', 'quotations']:\n", "            for word in df.loc[(df['level'] == 5) & (df['left'] < 1500)].sort_values(by='top').itertuples():\n", "                # print(word.text.lower())\n", "                if len(str(word.text)) > 5 and fuzz.partial_ratio(heading, word.text.lower()) >= confidence:\n", "                    # print(word)\n", "                    # print(fuzz.ratio('buyers', word.text.lower()))\n", "                    if word.left > 625:\n", "                        candidates.append(word.left)\n", "    return candidates\n", "\n",
"def find_header_height(df):\n", "    candidates = []\n", "    for confidence in reversed(range(80, 110, 10)):\n", "        for heading in ['shares', 'closing', 'sales', 'quotations', 'buyers', 'sellers', 'business']:\n", "            for word in df.loc[(df['level'] == 5) & (df['top'] < 1750) & (df['left'] < 3750) & (df['height'] < 90)].sort_values(by=['top', 'left']).itertuples():\n", "                if len(str(word.text)) > 5 and fuzz.partial_ratio(heading, word.text.lower()) >= confidence:\n", "                    # print(word)\n", "                    # print(fuzz.partial_ratio(heading, word.text.lower()))\n", "                    # return word.top\n", "                    candidates.append(word.top)\n", "    return candidates\n", "\n",
"def find_header(img_path):\n", "    # Image dimensions\n", "    img = process_image_for_ocr(img_path)\n", "    (h, w) = img.shape[:2]\n", "    points = []\n", "    # The header will always be at the top, so crop off the top of the image, rather than OCRing the whole thing\n", "    cropped = img[0:1750, 0:w]\n", "    col_widths = []\n", "    header_heights = []\n", "    # The psm settings can greatly affect the results, but they're unpredictable.\n", "    # Sometimes one setting works better than the other, I don't know why,\n", "    # so we're going to try them both and look for the best result.\n", "    for psm in [4, 6]:\n", "        df = pytesseract.image_to_data(cropped, config=f'--psm {psm} --oem 1 -l eng', output_type=pytesseract.Output.DATAFRAME)\n", "        col_widths += find_col_width(df)\n", "        # print(df.loc[(df['level'] == 5) & (df['left'] < 3750)].sort_values(by=['top', 'left']).to_dict('records'))\n", "        header_heights += find_header_height(df)\n", "    # margin = find_margin(df)\n",
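"    # Take the median of the candidate positions so that a single stray fuzzy match\n", "    # elsewhere on the page can't skew the result\n",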
"    try:\n", "        # header_height = sorted(header_heights)[0]\n", "        header_height = int(statistics.median(header_heights))\n", "    except (IndexError, statistics.StatisticsError):\n", "        header_height = 0\n", "    try:\n", "        # col_width = sorted(col_widths)[0] - 10\n", "        col_width = int(statistics.median(col_widths))\n", "    except (IndexError, statistics.StatisticsError):\n", "        col_width = 0\n", "    # print(col_width, header_height)\n", "    return (col_width, header_height)\n", "\n", "\n",
"def check_for_skew(lines):\n", "    '''\n", "    Check for skewing by looking at the near vertical lines detected in the image.\n", "    '''\n", "    angles = []\n", "\n", "    # Loop through detected lines\n", "    for line in lines:\n", "        # Get coords of line\n", "        for x1, y1, x2, y2 in line:\n", "            # Ignore short lines and lines in header\n", "            if abs(y1 - y2) > 150 and x1 > 300:\n", "                # Get the angle of the line\n", "                if y2 > y1:\n", "                    radians = math.atan2((y2 - y1), (x2 - x1))\n", "                else:\n", "                    radians = math.atan2((y1 - y2), (x1 - x2))\n", "                degrees = math.degrees(radians)\n", "                # print(degrees)\n", "                # If it's vertical-ish, save this angle\n", "                if degrees >= 80 and degrees <= 100:\n", "                    angles.append(degrees)\n", "    # Get the median of the saved angles\n", "    angle = statistics.median(angles) - 90\n", "    # print(angle)\n", "    return angle\n", "\n", "\n",
"def deskew(img, angle):\n", "    '''\n", "    Deskew image by rotating it by the supplied angle.\n", "    '''\n", "    # Get image dimensions\n", "    (h, w) = img.shape[:2]\n", "\n", "    # Get the centre of the image\n", "    center = (w // 2, h // 2)\n", "\n", "    # Rotate image by angle\n", "    M = cv2.getRotationMatrix2D(center, angle, 1.0)\n", "    rotated = cv2.warpAffine(img, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)\n", "\n", "    # Return the rotated image\n", "    return rotated\n", "\n", "\n",
"def add_grid(img):\n", "    '''\n", "    Draws a 100 x 100px grid on image.\n", "    Can be useful in interpreting column detection results.\n", "    '''\n", "    h, w = img.shape[:2]\n", "    for x in range(0, w, 100):\n", "        cv2.line(img, (x, 0), (x, h), (255, 0, 0), 1)\n", "    for y in range(0, h, 100):\n", "        cv2.line(img, (0, y), (w, y), (255, 0, 0), 1)\n", "    return img\n", "\n", "\n",
"def find_top(lines):\n", "    '''\n", "    Use horizontal lines near the top of the page to provide an approximation of the header height.\n", "    Used to crop page to ignore lines in header area.\n", "    More accurate header location is found using Tesseract.\n", "    '''\n", "    top = 0\n", "    y_values = []\n", "    for line in lines:\n", "        for x1, y1, x2, y2 in line:\n", "            bottom = y1 if y1 > y2 else y2\n", "            if bottom < 1000:\n", "                radians = math.atan2((y1 - y2), (x1 - x2))\n", "                degrees = math.degrees(radians)\n", "                if degrees > 179 and degrees < 181:\n", "                    y_values.append(bottom)\n", "    if y_values:\n", "        top = sorted(y_values)[-1]\n", "    return top\n", "\n", "\n",
"def find_columns(lines, h, w, column_top, col_width):\n", "    '''\n", "    Identifies most likely column values from within the set of straight lines in an image.\n", "    This could do with some cleaning up, but it's working well at the moment, so I don't really want to fiddle any more.\n", "    Note that this does depend on some knowledge of the images to define ranges of expected values.\n", "    '''\n", "    x_values = []\n", "\n", "    # Get the approximate position of the header so we can ignore lines above this\n", "    # (note that this recalculated value overrides the supplied column_top argument)\n", "    column_top = find_top(lines)\n", "\n", "    # Find the x values of vertical lines\n", "    for line in lines:\n", "        for x1, y1, x2, y2 in line:\n", "\n", "            # Find the top\n", "            top = y1 if y1 < y2 else y2\n", "\n", "            # Ignore column lines at the top & bottom of the image\n", "            if top > column_top and top < (h - 600):\n", "\n", "                # Find the leftmost point\n", "                first = x1 if x1 < x2 else x2\n", "\n", "                # Find the angle of the line\n", "                radians = math.atan2((y1 - y2), (x1 - x2))\n", "                degrees = abs(math.degrees(radians))\n", "\n", "                # If the line is (close to) vertical, we'll save the left-most x value\n", "                if degrees >= 89 and degrees <= 91:\n", "                    x_values.append(first)\n", "\n",
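"    # x_values now holds one x coordinate per detected vertical segment; a real column\n", "    # rule usually contributes many nearby segments while stray text contributes only a\n", "    # few, which is what the clustering below exploits\n",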
"    # Sort the x_values\n", "    x_values = sorted(x_values)\n", "\n", "    # Cluster together values within the specified distance\n", "    clusters = []\n", "    start = 0\n", "    # Lines less than this distance apart will be clustered\n", "    distance = 10\n", "    cluster = []\n", "\n", "    # Loop through x values\n", "    for x in x_values:\n", "\n", "        # If the x value is less than the specified distance from the previous point,\n", "        # we'll add it to the current cluster\n", "        if x < start + distance:\n", "            cluster.append(x)\n", "\n", "        # If not we'll save the current cluster, and start a new one\n", "        else:\n", "            if cluster:\n", "                # Add the current cluster to the list of clusters\n", "                clusters.append(cluster)\n", "\n", "            # Start a new cluster at the current point\n", "            cluster = [x]\n", "\n", "        # Set the current position\n", "        start = x\n", "\n", "    # Add the last cluster once we've finished the loop\n", "    clusters.append(cluster)\n", "\n",
"    # Now we have a list of clustered x values\n", "    # We'll compare nearby clusters and keep the ones with the most values (most likely to be columns)\n", "    best_clusters = [[0]]\n", "\n", "    # Loop through clusters\n", "    for cluster in clusters:\n", "\n", "        # If the current cluster is within 200px of the previous one\n", "        if cluster[0] < best_clusters[-1][-1] + 200:\n", "\n", "            # Check to see which cluster contains the most values\n", "            # If it's the current one we'll add it to our best clusters\n", "            if len(cluster) > len(best_clusters[-1]):\n", "\n", "                # Remove the previous cluster from best clusters\n", "                best_clusters.pop()\n", "\n", "                # Add this one\n", "                best_clusters.append(cluster)\n", "\n", "        # If this cluster isn't near the previous one, add it to best clusters\n", "        else:\n", "            best_clusters.append(cluster)\n", "\n", "    # print(best_clusters)\n", "\n",
"    # Now we have our best candidates for columns in best clusters\n", "    # We'll do some further filtering by checking the clusters against our expectations of column positions\n", "    # The pixel values used below are based on trial and error with the Stock Exchange images\n", "    # Obviously if you were using this on other images you'd want to adjust them accordingly\n", "\n", "    columns = []\n", "    start = 0\n", "    gutter = 0\n", "    gap = None\n", "    max_col_width = 2000\n", "\n", "    # Loop through our best clusters\n", "    for cluster in best_clusters:\n", "        min_col_width = 950\n", "        # If the leftmost point in this cluster is less than 600 then it's the gutter\n", "        if cluster and cluster[0] < 600:\n", "\n", "            # Set the gutter value to a mean of the clustered points\n", "            gutter = mean(cluster)\n", "\n", "            # Sometimes the gutter isn't detected, so we'll set a reasonable start position\n", "            if gutter == 0:\n", "                gutter = 200\n", "            if gutter <= 200:\n", "                start = 250\n", "            else:\n", "                start = gutter + 50\n", "            if col_width:\n", "                # print(col_width)\n", "                # min_col_width = ((col_width - start) * 2) - 180\n", "                # max_col_width = ((col_width - start) * 2) + 180\n", "                min_col_width = max(min_col_width, int(round((col_width - start) * 1.65)))\n", "                max_col_width = int(round((col_width - start) * 2.35))\n", "                # print(min_col_width)\n", "                # print(list(reversed(range(min_col_width, max_col_width + 100, 100))))\n", "        else:\n", "            # Checking the gap between this cluster and the previous one\n", "            if gap:\n", "                this_gap = gap\n", "            else:\n", "\n", "                # Current gap is the leftmost point of this cluster minus the previous column position\n", "                this_gap = cluster[0] - start\n", "\n",
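"            # Scanning candidate widths from widest to narrowest means a cluster is\n", "            # accepted at the most generous width first, which reduces the chance of\n", "            # mistaking a line in the middle of a column for a column boundary\n",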
"            # This range represents approximate min/max column widths\n", "            # We'll look for columns at 100 px intervals starting from the max value until we hit the min value\n", "            for width in reversed(range(min_col_width, max_col_width + 100, 100)):\n", "                cluster_mean = mean(cluster)\n", "                # print(width)\n", "                # print(cluster_mean)\n", "                # print(start)\n", "                # print(this_gap)\n", "                # print('----')\n", "                # Try to make sure columns are roughly the same width\n", "                if (cluster_mean - start) > (this_gap - 500) and (cluster_mean - start) < (this_gap + 500):\n", "\n", "                    # If cluster falls within expected values, we'll assume it's a column\n", "                    if cluster and cluster_mean >= (start + width) and cluster_mean <= (w - 900) and this_gap < 2600:\n", "\n", "                        # Save mean of clustered values as column\n", "                        columns.append(cluster_mean)\n", "\n", "                        # Set the next start value to value of the last point in cluster\n", "                        start = cluster_mean\n", "                        gap = this_gap\n", "\n", "                        # Don't look for any more columns in this cluster\n", "                        break\n", "    columns.append(w)\n", "    return (gutter, columns)\n", "\n", "\n",
"def resize(img, h, w):\n", "    '''\n", "    Resize image to a width of 5000 px.\n", "    '''\n", "    # Find the scale to use, based on the target width\n", "    scale = 5000 / float(w)\n", "\n", "    # Resize the image\n", "    resized = cv2.resize(img, None, fx=scale, fy=scale, interpolation=cv2.INTER_AREA)\n", "    return resized\n", "\n", "\n",
"def save_header(img, header, w, image_name, output_dir):\n", "    '''\n", "    Save the detected header as a separate image.\n", "    '''\n", "    # Where to save the image\n", "    header_dir = os.path.join(output_dir, 'headers')\n", "\n", "    # Crop the image using header value\n", "    # Numpy slicing - roi = im[y1:y2, x1:x2]\n", "    header_img = img[0:header + 20, 0:w]\n", "\n", "    # Save the cropped image\n", "    cv2.imwrite('{}/{}-header.jpg'.format(header_dir, image_name[:-4]), header_img)\n", "\n", "\n",
"def save_columns(img, columns, header, h, image_name, output_dir):\n", "    '''\n", "    Save each detected column as a separate image.\n", "    Note that the columns list should include the gutter at the beginning and the image width at the end.\n", "    '''\n", "\n", "    # Where to save the images\n", "    col_dir = os.path.join(output_dir, 'columns')\n", "    # Loop through the column values\n", "    for index, column in enumerate(columns):\n", "\n", "        # Get the value of the next column to use as the width of the cropped column\n", "        try:\n", "            next_col = columns[index + 1]\n", "        except IndexError:\n", "\n", "            # If there's no next column we've reached the end of the image, so do nothing\n", "            pass\n", "        else:\n", "\n", "            # Add a little to the margins of the image\n", "            if column > 20:\n", "                this_col = column - 20\n", "            else:\n", "                this_col = column\n", "\n", "            # Crop the image to the dimensions of the column\n", "            # Cast to int because mean() can return floats, and numpy slicing needs integers\n", "            col_img = img[max(0, header - 20):h, int(this_col):int(next_col)]\n", "\n", "            # Save the cropped image, using the index value to denote column order\n", "            cv2.imwrite('{}/{}-col-{}.jpg'.format(col_dir, image_name[:-4], index + 1), col_img)\n", "\n", "\n",
"def display_lines(image_name, output_dir, img, lines):\n", "    '''\n", "    For testing / debugging - shows ALL the detected lines\n", "    '''\n", "    for line in lines:\n", "        # print(line)\n", "        x1, y1, x2, y2 = line[0]\n", "        cv2.line(img, (x1, y1), (x2, y2), (0, 0, 255), 8)\n", "    # cv2.imwrite('{}/{}-lines.jpg'.format(output_dir, image_name[:-4]), img)\n", "\n", "\n",
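"# process_image() below ties the steps together: read and (maybe) deskew the page,\n", "# locate the header with Tesseract, find column lines with OpenCV, then crop or mark up\n",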
"def process_image(image_name, image_path, output_dir='test', markup=False, grid=False, save_derivs=True):\n", "    '''\n", "    Detect columns and header in the supplied image.\n", "\n", "    Parameters:\n", "      image_name\n", "      image_path\n", "      output_dir (must exist)\n", "      markup – if True, draw the results on the image; if False, crop and save the detected regions\n", "      grid – if True, draw a grid on the image\n", "      save_derivs – if True, crop and save the detected columns as separate images\n", "    '''\n", "    img = cv2.imread(image_path)\n", "\n", "    # Get image dimensions\n", "    try:\n", "        h, w = img.shape[:2]\n", "\n", "    # Weed out dodgy images\n", "    except AttributeError:\n", "        print('Not a valid image: {}'.format(image_path))\n", "\n", "    # If it looks ok, then proceed...\n", "    else:\n", "\n", "        # To standardise things a little, we'll resize images with a width greater than 5000\n", "        if w > 5000:\n", "            img = resize(img, h, w)\n", "\n", "            # Get the new dimensions\n", "            h, w = img.shape[:2]\n", "\n", "        # Detect straight lines in the image\n", "        lines = find_lines(img)\n", "\n", "        # Use the detected lines to check for skew\n", "        # I'm not actually sure if these deskewing steps are useful\n", "        angle = check_for_skew(lines)\n", "\n", "        # If image seems to be skewed, then deskew!\n", "        if angle != 0:\n", "            # print('Deskewing')\n", "            img = deskew(img, angle)\n", "\n", "            # Once deskewed we have to redo line detection because positions will have changed\n", "            lines = find_lines(img)\n", "\n", "        # display_lines(image_name, output_dir, img, lines)\n", "\n", "        # Use Tesseract to find the column width and the bottom of the header\n", "        col_width, header = find_header(image_path)\n", "        # print(col_width, header)\n", "\n", "        # Filter the detected lines to identify columns\n", "        gutter, columns = find_columns(lines, h, w, header, col_width)\n", "\n", "        # Draw a grid on image (for debugging)\n", "        if grid:\n", "            img = add_grid(img)\n", "\n", "        # Crop & save columns and header\n", "        if save_derivs:\n", "            # Crop and save header\n", "            # save_header(img, header, w, image_name, output_dir)\n", "\n", "            # Add the gutter to the start of the columns list (find_columns has already added the page width to the end)\n", "            columns = [gutter] + columns\n", "\n", "            # Crop and save columns\n", "            save_columns(img, columns, header, h, image_name, output_dir)\n", "\n", "        # Draw detected column & header lines on image and save the results (for testing)\n", "        if markup:\n", "\n", "            # Draw gutter (cast to int because mean() can return floats, but OpenCV needs integer coordinates)\n", "            cv2.line(img, (int(gutter), 0), (int(gutter), h), (0, 255, 0), 5)\n", "\n", "            # Draw columns\n", "            for column in columns:\n", "                cv2.line(img, (int(column), 0), (int(column), h), (0, 255, 0), 5)\n", "\n", "            # Draw header\n", "            cv2.line(img, (0, header), (w, header), (255, 0, 0), 3)\n", "\n", "            # Save the annotated image\n", "            cv2.imwrite('{}/{}.jpg'.format(output_dir, image_name[:-4]), img)" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Process a single image" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "process_image('testing.jpg', '/Volumes/Sydney Stock Exchange Vol 1/Sydney Stock Exchange 001-109/Transferred AU NBAC N193-077/N193-077_0001.tif', 'testing', markup=True)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "find_header('/Volumes/Sydney Stock Exchange Vol 1/Sydney Stock Exchange 001-109/Transferred AU NBAC N193-001/N193-001_0006.tif')" ] },
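{ "cell_type": "markdown", "metadata": {}, "source": [ "When a page's columns are being missed, it can help to view all of the raw lines detected by `find_lines()` before any filtering. A minimal sketch (the path is a placeholder for any page image you have locally):" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Uncomment and point at a real page image\n", "# img = cv2.imread('samples/page.tif')  # placeholder path\n", "# lines = find_lines(img)\n", "# display_lines('page.tif', 'testing', img, lines)\n", "# cv2.imwrite('testing/page-lines.jpg', img)  # display_lines draws in place, so save to inspect" ] },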
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Process a directory" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Directory to process\n", "dir_path = '/Volumes/Sydney Stock Exchange Vol 1/Sydney Stock Exchange 001-109/Transferred AU NBAC N193-001'\n", "\n", "# This is where the processed images should go\n", "output_dir = 'fullsize-processed/AU NBAC N193-001'\n", "os.makedirs(os.path.join(output_dir, 'columns'), exist_ok=True)\n", "os.makedirs(os.path.join(output_dir, 'headers'), exist_ok=True)\n", "\n", "# Loop through images with .tif(f) extension\n", "for img_name in tqdm([i for i in os.listdir(dir_path) if '.tif' in i[-5:].lower()]):\n", "    # print(img_name)\n", "    img_path = os.path.join(dir_path, img_name)\n", "    try:\n", "        process_image(img_name, img_path, output_dir, markup=True)\n", "    except (TypeError, statistics.StatisticsError):\n", "        print('ERROR: {}'.format(img_name))" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Process lots of directories" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "start_vol = 188\n", "# Directory of directories\n", "input_path = Path('/Volumes/Sydney Stock Exchange Vol 2/Sydney Stock Exchange Vol 2 110-199/')\n", "\n", "# This is where the processed images should go\n", "# output_path = Path('/Volumes/bigdata/mydata/stockexchange/processed')\n", "# output_path = 'fulltext-processed'\n", "# os.makedirs(os.path.join(output_dir, 'columns'), exist_ok=True)\n", "# os.makedirs(os.path.join(output_dir, 'headers'), exist_ok=True)\n", "\n", "# Loop through directories\n", "for img_dir in tqdm([d for d in input_path.glob('*') if d.is_dir()], desc='Directories'):\n", "    # print(img_dir)\n", "    vol_num = int(re.search(r'(\\d+)$', str(img_dir)).group(1))\n", "    if vol_num >= start_vol:\n", "        output_path = Path('/Volumes/bigdata/mydata/stockexchange/processed', img_dir.name.replace('Transferred ', ''))\n", "        Path(output_path, 'columns').mkdir(parents=True, exist_ok=True)\n", "        # Loop through images with .tif(f) extension\n", "        for img_path in tqdm([i for i in img_dir.glob('*') if '.tif' in i.name[-5:].lower()], leave=False, desc='Images'):\n", "            img_name = img_path.name\n", "            # print(str(output_path))\n", "            try:\n", "                # find_lines(img, img_path)\n", "                process_image(str(img_name), str(img_path), str(output_path), markup=True)\n", "            except (TypeError, statistics.StatisticsError):\n", "                pass" ] },
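{ "cell_type": "markdown", "metadata": {}, "source": [ "The loop above silently skips pages that raise errors. A sketch of one way to keep track of them for later reprocessing (the `failed_pages.txt` filename is just an example):" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Uncomment and merge into the inner loop above to log failures instead of discarding them\n", "# with open('failed_pages.txt', 'a') as log:\n", "#     try:\n", "#         process_image(str(img_name), str(img_path), str(output_path), markup=True)\n", "#     except (TypeError, statistics.StatisticsError) as err:\n", "#         log.write('{}\\t{}\\n'.format(img_path, err))" ] },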
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Speed comparison\n", "\n", "For this to work on SWAN you need to comment out the header detection, as it needs a newer version of Tesseract to work properly.\n", "\n", "### Local" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%time\n", "process_image('N193-022_0184.tif', '/Users/tim/Dropbox/working_code/stockexchange/src/notebooks/samples/AU-NBAC-N193-022/N193-022_0184.tif', 'data', markup=True)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%timeit\n", "process_image('N193-022_0184.tif', '/Users/tim/Dropbox/working_code/stockexchange/src/notebooks/samples/AU-NBAC-N193-022/N193-022_0184.tif', 'data', markup=True)" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### SWAN" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%time\n", "process_image('N193-007_0012.tif', '/scratch/cloudstor/Shared/ANU-Library/Sydney Stock Exchange 1901-1950/AU NBAC N193-007/N193-007_0012.tif', 'data', markup=True)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%timeit\n", "process_image('N193-022_0184.tif', '/scratch/cloudstor/Shared/ANU-Library/Sydney Stock Exchange 1901-1950/AU NBAC N193-022/N193-022_0184.tif', 'data', markup=True)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }