{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Process images by year\n",
    "\n",
    "Creates individual CSV files for each year that contain image dimensions and computed number of columns for each file.\n",
    "\n",
    "Use on SWAN to run column detection across all of the images.\n",
    "\n",
    "This notebook generates CSV files for each year with summary results.\n",
    "\n",
    "Fields in the CSV files:\n",
    "\n",
    "* `directory`\n",
    "* `name`\n",
    "* `path`\n",
    "* `referenceCode`\n",
    "* `startDate`\n",
    "* `endDate`\n",
    "* `year`\n",
    "* `width` – width of the image\n",
    "* `height` – height of the image\n",
    "* `columns` – number of columns\n",
    "* `column_positions` – comma-separated string of x values of columns in pixels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import cv2\n",
    "import math\n",
    "import statistics\n",
    "from tqdm import tqdm\n",
    "import os\n",
    "from IPython.display import display, HTML\n",
    "try:\n",
    "    from PIL import Image\n",
    "except ImportError:\n",
    "    import Image\n",
    "import pytesseract\n",
    "from statistics import mean\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# COPIED FROM MAIN PROCESSING NOTEBOOK\n",
    "\n",
    "def find_lines(img):\n",
    "    '''\n",
    "    Find straight line segments in an image.\n",
    "\n",
    "    Parameters:\n",
    "        img – BGR image array, eg as loaded by cv2.imread\n",
    "\n",
    "    Returns the result of cv2.HoughLinesP.\n",
    "    OpenCV gives None rather than an empty array when nothing is detected,\n",
    "    so callers need to allow for that.\n",
    "\n",
    "    These settings have been arrived at after much trial and error.\n",
    "    '''\n",
    "    # Convert to grayscale\n",
    "    gray = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)\n",
    "\n",
    "    # Threshold image (convert to black and white)\n",
    "    retval, th = cv2.threshold(gray,125,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU)\n",
    "    # cv2.imwrite('data/th.jpg',th)\n",
    "\n",
    "    # Use median blur to get rid of a lot of the text\n",
    "    median = cv2.medianBlur(th, 11)\n",
    "    # cv2.imwrite('data/median.jpg',median)\n",
    "\n",
    "    # Try to strengthen the remaining lines\n",
    "    kernel = np.ones((5,5),np.uint8)\n",
    "    opened = cv2.morphologyEx(median, cv2.MORPH_OPEN, kernel, iterations=1)\n",
    "    # cv2.imwrite('data/opened.jpg',opened)\n",
    "\n",
    "    # Find the edges of the remaining shapes\n",
    "    edges = cv2.Canny(opened,50,150,apertureSize=3)\n",
    "    # cv2.imwrite('data/edges.jpg',edges)\n",
    "\n",
    "    # Find straight lines in the edges\n",
    "    # Note that the minLineLength and maxLineGap values can have a dramatic effect on the number of lines detected.\n",
    "    # Note too that what looks to the human eye as a single straight line,\n",
    "    # can actually be a series of short lines with tiny gaps between them,\n",
    "    # so while increasing the minLineLength reduces noise from text, it can also filter out columns.\n",
    "    lines = cv2.HoughLinesP(image=edges,rho=1,theta=np.pi/180, threshold=200,lines=np.array([]), minLineLength=100,maxLineGap=100)\n",
    "    return lines\n",
    "\n",
    "\n",
    "def find_header(img):\n",
    "    '''\n",
    "    Locate the page header using OCR.\n",
    "    Note that in this case using OCR seemed easier than trying to sort/cluster lines.\n",
    "\n",
    "    Only the top quarter of the image (and the left three quarters of its width) is OCRd.\n",
    "\n",
    "    Returns the smallest 'top' value (y position in pixels) of the header words found,\n",
    "    or 0 if none of the header words are found.\n",
    "    '''\n",
    "\n",
    "    # Image dimensions\n",
    "    (h, w) = img.shape[:2]\n",
    "    points = []\n",
    "\n",
    "    # The header will always be at the top, so crop off the top of the image, rather than OCRing the whole thing\n",
    "    cropped = img[0:round(h/4), 0:round(w-(w/4))]\n",
    "\n",
    "    # I should probably do some image preprocessing here (see row detection notebook)\n",
    "    # Get OCR results\n",
    "    results = pytesseract.image_to_data(cropped, output_type=pytesseract.Output.DICT)\n",
    "\n",
    "    # Loop through the OCR results looking for specific words that appear in the header\n",
    "    for index, word in enumerate(results['text']):\n",
    "        # These are the words that appear in the header\n",
    "        # If we find one (and it's less than 50px high), add its 'top' value to our list of points\n",
    "        if re.search(r'Shares|Quotations|Buyers|Sellers|Business|Done|Closing prices|Sales', word, flags=re.IGNORECASE) and results['height'][index] < 50:\n",
    "            points.append(results['top'][index])\n",
    "\n",
    "    # Use the highest matching word on the page\n",
    "    # If we can't find a header return 0\n",
    "    return min(points, default=0)\n",
    "\n",
    "\n",
    "def check_for_skew(lines):\n",
    "    '''\n",
    "    Check for skewing by looking at the near vertical lines detected in the image.\n",
    "\n",
    "    Parameters:\n",
    "        lines – line segments from find_lines (can be None or empty)\n",
    "\n",
    "    Returns the median deviation from vertical (in degrees) of the qualifying lines.\n",
    "    Returns 0 if there are no lines to measure, so that callers skip deskewing.\n",
    "    '''\n",
    "    angles = []\n",
    "\n",
    "    # No lines were detected at all, so there's nothing to measure\n",
    "    if lines is None:\n",
    "        return 0\n",
    "\n",
    "    # Loop through detected lines\n",
    "    for line in lines:\n",
    "        # Get coords of line\n",
    "        for x1,y1,x2,y2 in line:\n",
    "            # Ignore short lines and lines whose x1 is within 300px of the left edge\n",
    "            if abs(y1 - y2) > 150 and x1 > 300:\n",
    "                # Get the angle of the line, always measuring from its upper end to its lower end\n",
    "                if y2 > y1:\n",
    "                    radians = math.atan2((y2 - y1), (x2 - x1))\n",
    "                else:\n",
    "                    radians = math.atan2((y1 - y2), (x1 - x2))\n",
    "                degrees = math.degrees(radians)\n",
    "                # print(degrees)\n",
    "                # If it's vertical-ish, save this angle\n",
    "                if degrees >= 80 and degrees <= 100:\n",
    "                    angles.append(degrees)\n",
    "\n",
    "    # statistics.median raises StatisticsError on an empty list,\n",
    "    # so if there are no vertical-ish lines we report no skew\n",
    "    if not angles:\n",
    "        return 0\n",
    "\n",
    "    # Get the median of the saved angles\n",
    "    angle = statistics.median(angles) - 90\n",
    "    # print(angle)\n",
    "    return angle\n",
    "\n",
    "\n",
    "def deskew(img, angle):\n",
    "    '''\n",
    "    Deskew image by rotating it by the supplied angle (in degrees) around its centre.\n",
    "    Returns the rotated image, which has the same dimensions as the original.\n",
    "    '''\n",
    "    # Get image dimensions\n",
    "    (h, w) = img.shape[:2]\n",
    "\n",
    "    # Get the centre of the image\n",
    "    center = (w // 2, h // 2)\n",
    "\n",
    "    # Rotate image by angle\n",
    "    M = cv2.getRotationMatrix2D(center, angle, 1.0)\n",
    "    rotated = cv2.warpAffine(img, M, (w, h),flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)\n",
    "\n",
    "    # Return the rotated image\n",
    "    return rotated\n",
    "\n",
    "\n",
    "def add_grid(img):\n",
    "    '''\n",
    "    Draws a 100 x 100px grid on image.\n",
    "    Can be useful in interpreting column detection results.\n",
    "    Note that the grid is drawn on the supplied image, which is also returned.\n",
    "    '''\n",
    "    h, w = img.shape[:2]\n",
    "    for x in range(0, w, 100):\n",
    "        cv2.line(img,(x,0),(x,h),(255,0,0),1)\n",
    "    for y in range(0, h, 100):\n",
    "        cv2.line(img,(0,y),(w,y),(255,0,0),1)\n",
    "    return img\n",
    "\n",
    "\n",
    "def find_top(lines):\n",
    "    '''\n",
    "    Use horizontal lines near the top of the page to provide an approximation of the header height.\n",
    "    Used to crop page to ignore lines in header area.\n",
    "    More accurate header location is found using Tesseract.\n",
    "\n",
    "    Returns the largest y value of the horizontal lines found above 1000px,\n",
    "    or 0 if there are none (or if lines is None).\n",
    "    '''\n",
    "    top = 0\n",
    "    y_values = []\n",
    "\n",
    "    # No lines were detected at all\n",
    "    if lines is None:\n",
    "        return top\n",
    "\n",
    "    for line in lines:\n",
    "        for x1,y1,x2,y2 in line:\n",
    "            # The lowest point of the line\n",
    "            bottom = y1 if y1 > y2 else y2\n",
    "            if bottom < 1000:\n",
    "                radians = math.atan2((y1 - y2), (x1 - x2))\n",
    "                degrees = math.degrees(radians)\n",
    "                # NOTE(review): atan2 gives values up to 180, so only the lower bound has an effect here,\n",
    "                # and lines that work out at just above -180 are not counted – confirm this is intended\n",
    "                if degrees > 179 and degrees < 181:\n",
    "                    y_values.append(bottom)\n",
    "    if y_values:\n",
    "        top = sorted(y_values)[-1]\n",
    "    return top\n",
    "\n",
    "\n",
    "def find_columns(lines, h, w):\n",
    "    '''\n",
    "    Identifies most likely column values from within the set of straight lines in an image.\n",
    "    This could do with some cleaning up, but it's working well at the moment, so I don't really want to fiddle any more.\n",
    "    Note that this does depend on some knowledge of the images to define ranges of expected values.\n",
    "\n",
    "    Parameters:\n",
    "        lines – line segments from find_lines (can be None or empty)\n",
    "        h – height of the image\n",
    "        w – width of the image\n",
    "\n",
    "    Returns a tuple of (gutter, columns) where gutter is the x position of the gutter (0 if not found),\n",
    "    and columns is a list of the x positions of the columns.\n",
    "    '''\n",
    "    x_values = []\n",
    "\n",
    "    # If no lines were detected we just work with an empty list\n",
    "    if lines is None:\n",
    "        lines = []\n",
    "\n",
    "    # Get the approximate position of the header so we can ignore lines above this\n",
    "    column_top = find_top(lines)\n",
    "\n",
    "    # Find the x values of vertical lines\n",
    "    for line in lines:\n",
    "        for x1,y1,x2,y2 in line:\n",
    "\n",
    "            # Find the top\n",
    "            top = y1 if y1 < y2 else y2\n",
    "\n",
    "            # Ignore column lines at the top & bottom of the image\n",
    "            if top > column_top and top < (h - 600):\n",
    "\n",
    "                # Find the leftmost point\n",
    "                first = x1 if x1 < x2 else x2\n",
    "\n",
    "                # Find the angle of the line\n",
    "                radians = math.atan2((y1 - y2), (x1 - x2))\n",
    "                degrees = abs(math.degrees(radians))\n",
    "\n",
    "                # If the line is (close to) vertical, we'll save the left-most x value\n",
    "                if degrees >= 89 and degrees <= 91:\n",
    "                    x_values.append(first)\n",
    "\n",
    "    # Sort the x_values\n",
    "    x_values = sorted(x_values)\n",
    "\n",
    "    # Cluster together values within the specified distance\n",
    "    clusters = []\n",
    "    start = 0\n",
    "    # Lines less than this distance apart will be clustered\n",
    "    distance = 10\n",
    "    cluster = []\n",
    "\n",
    "    # Loop through x values\n",
    "    for x in x_values:\n",
    "\n",
    "        # If the x value is less than the specified distance from the previous point,\n",
    "        # we'll add it to the current cluster\n",
    "        if x < start + distance:\n",
    "            cluster.append(x)\n",
    "\n",
    "        # If not we'll save the current cluster, and start a new one\n",
    "        else:\n",
    "            if cluster:\n",
    "                # Add the current cluster to the list of clusters\n",
    "                clusters.append(cluster)\n",
    "\n",
    "            # Start a new cluster at the current point\n",
    "            cluster = [x]\n",
    "\n",
    "        # Set the current position\n",
    "        start = x\n",
    "\n",
    "    # Add the last cluster once we've finished the loop\n",
    "    # It will be empty if there were no vertical lines, and the code below expects every cluster to contain a value\n",
    "    if cluster:\n",
    "        clusters.append(cluster)\n",
    "\n",
    "    # Now we have a list of clustered x values\n",
    "    # We'll compare nearby clusters and keep the ones with the most values (most likely to be columns)\n",
    "    best_clusters = [[0]]\n",
    "\n",
    "    # Loop through clusters\n",
    "    for cluster in clusters:\n",
    "\n",
    "        # If the current cluster is within 200px of the previous one\n",
    "        if cluster[0] < best_clusters[-1][-1] + 200:\n",
    "\n",
    "            # Check to see which cluster contains the most values\n",
    "            # If it's the current one we'll add it to our best clusters\n",
    "            if len(cluster) > len(best_clusters[-1]):\n",
    "\n",
    "                # Remove the previous cluster from best clusters\n",
    "                best_clusters.pop()\n",
    "\n",
    "                # Add this one\n",
    "                best_clusters.append(cluster)\n",
    "\n",
    "        # If this cluster isn't near the previous one, add it to best clusters\n",
    "        else:\n",
    "            best_clusters.append(cluster)\n",
    "\n",
    "    # Now we have our best candidates for columns in best clusters\n",
    "    # We'll do some further filtering by checking the clusters against our expectations of column positions\n",
    "    # The pixel values used below are based on trial and error with the Stock Exchange images\n",
    "    # Obviously if you were using this on other images you'd want to adjust them accordingly\n",
    "\n",
    "    columns = []\n",
    "    start = 0\n",
    "    gutter = 0\n",
    "    gap = None\n",
    "\n",
    "    # Loop through our best clusters\n",
    "    for cluster in best_clusters:\n",
    "\n",
    "        # If the leftmost point in this cluster is less than 600 then it's the gutter\n",
    "        if cluster and cluster[0] < 600:\n",
    "\n",
    "            # Set the gutter value to a mean of the clustered points\n",
    "            gutter = mean(cluster)\n",
    "\n",
    "            # Sometimes the gutter isn't detected, so we'll set a reasonable start position\n",
    "            if gutter < 100:\n",
    "                start = 100\n",
    "            else:\n",
    "                start = gutter\n",
    "        else:\n",
    "            # Checking the gap between this cluster and the previous one\n",
    "            # Once a column has been found, its gap is used as the expected gap for the rest\n",
    "            if gap:\n",
    "                this_gap = gap\n",
    "            else:\n",
    "\n",
    "                # Current gap is the leftmost point of this cluster minus the previous column position\n",
    "                this_gap = cluster[0] - start\n",
    "\n",
    "            # This range represents approximate min/max column widths\n",
    "            # We'll look for columns at 100 px intervals starting from the max value until we hit the min value\n",
    "            for width in reversed(range(900, 2000, 100)):\n",
    "\n",
    "                # Try to make sure columns are roughly the same width\n",
    "                if (cluster[0] - start) > (this_gap - 600) and (cluster[0] - start) < (this_gap + 600):\n",
    "\n",
    "                    # If cluster falls within expected values, we'll assume it's a column\n",
    "                    if cluster and cluster[0] > (start + width) and cluster[0] < (w - 600) and this_gap < 2600:\n",
    "\n",
    "                        # Save mean of clustered values as column\n",
    "                        columns.append(mean(cluster))\n",
    "\n",
    "                        # Set the next start value to value of the last point in cluster\n",
    "                        start = cluster[-1]\n",
    "                        gap = this_gap\n",
    "\n",
    "                        # Don't look for any more columns in this cluster\n",
    "                        break\n",
    "    #print(gutter)\n",
    "    #print(columns)\n",
    "    return (gutter, columns)\n",
    "\n",
    "\n",
    "def resize(img, h, w):\n",
    "    '''\n",
    "    Resize image to a width of 5000 px, keeping the aspect ratio.\n",
    "    The callers in this notebook only use this on images wider than 5000 px.\n",
    "    Note that h isn't used, the scale is based on w alone.\n",
    "    '''\n",
    "    # Find the scale to use, based on max width\n",
    "    scale = 5000 / float(w)\n",
    "\n",
    "    # Resize the image\n",
    "    resized = cv2.resize(img, None, fx=scale, fy=scale, interpolation = cv2.INTER_AREA)\n",
    "    return resized\n",
    "\n",
    "\n",
    "def save_header(img, header, w, image_name, output_dir):\n",
    "    '''\n",
    "    Save the detected header as a separate image.\n",
    "    The image is saved in a 'headers' directory within output_dir (created if necessary).\n",
    "    '''\n",
    "    # Where to save the image\n",
    "    header_dir = os.path.join(output_dir, 'headers')\n",
    "\n",
    "    # cv2.imwrite won't create directories, it just fails quietly, so make sure it's there\n",
    "    os.makedirs(header_dir, exist_ok=True)\n",
    "\n",
    "    # Crop the image using header value\n",
    "    # Numpy slicing - roi = im[y1:y2, x1:x2]\n",
    "    header_img = img[0:header+20, 0:w]\n",
    "\n",
    "    # Save the cropped image\n",
    "    cv2.imwrite('{}/{}-header.jpg'.format(header_dir, image_name[:-4]), header_img)\n",
    "\n",
    "\n",
    "def save_columns(img, columns, header, h, image_name, output_dir):\n",
    "    '''\n",
    "    Save each detected column as a separate image.\n",
    "    Note that the columns list should include the gutter at the beginning and the image width at the end.\n",
    "    The images are saved in a 'columns' directory within output_dir (created if necessary).\n",
    "    '''\n",
    "\n",
    "    # Where to save the images\n",
    "    col_dir = os.path.join(output_dir, 'columns')\n",
    "\n",
    "    # cv2.imwrite won't create directories, it just fails quietly, so make sure it's there\n",
    "    os.makedirs(col_dir, exist_ok=True)\n",
    "\n",
    "    # Start the crop a little above the header\n",
    "    # If the header is less than 20 (including 0 for header not found) this would be negative,\n",
    "    # and numpy would count back from the bottom of the image, so stop at 0\n",
    "    top = max(header - 20, 0)\n",
    "\n",
    "    # Loop through pairs of column values, the next column gives the right hand edge of the crop\n",
    "    # The last value has no next column, so it's not used as a left hand edge\n",
    "    for index, (column, next_col) in enumerate(zip(columns, columns[1:])):\n",
    "\n",
    "        # Column values are means, so they might not be whole numbers, but slices need integers\n",
    "        left = int(round(column))\n",
    "        right = int(round(next_col))\n",
    "\n",
    "        # Add a little to the margins of the image\n",
    "        if left > 20:\n",
    "            left = left - 20\n",
    "\n",
    "        # Crop the image to the dimensions of the column\n",
    "        col_img = img[top:h, left:right]\n",
    "\n",
    "        # Save the cropped image, using the index value to denote column order\n",
    "        cv2.imwrite('{}/{}-col-{}.jpg'.format(col_dir, image_name[:-4], index+1), col_img)\n",
    "\n",
    "\n",
    "def display_lines(image_name, output_dir, img, lines):\n",
    "    '''\n",
    "    For testing / debugging - draws ALL the detected lines on the image.\n",
    "    The line that saves the result is commented out.\n",
    "    '''\n",
    "    # Nothing to draw if no lines were detected\n",
    "    if lines is None:\n",
    "        return\n",
    "    for line in lines:\n",
    "        #print(line)\n",
    "        x1,y1,x2,y2 = line[0]\n",
    "        cv2.line(img,(x1,y1),(x2,y2),(0,0,255),8)\n",
    "    #cv2.imwrite('{}/{}-lines.jpg'.format(output_dir, image_name[:-4]), img)\n",
    "\n",
    "\n",
    "def process_image(image_name, image_path, output_dir='test', markup=False, grid=False):\n",
    "    '''\n",
    "    Detect columns and header in the supplied image.\n",
    "\n",
    "    Parameters:\n",
    "        image_name\n",
    "        image_path\n",
    "        output_dir (must exist)\n",
    "        markup – if True, draw the results on the image, if False crop and save the detected regions.\n",
    "        grid – if True, draw a grid on the image\n",
    "    '''\n",
    "\n",
    "    img = cv2.imread(image_path)\n",
    "\n",
    "    # Get image dimensions\n",
    "    try:\n",
    "        h, w = img.shape[:2]\n",
    "\n",
    "    # Weed out dodgy images (cv2.imread gives None if it can't read the file)\n",
    "    except AttributeError:\n",
    "        print('Not a valid image: {}'.format(image_path))\n",
    "\n",
    "    # If it looks ok, then proceed...\n",
    "    else:\n",
    "\n",
    "        # To standardise things a little, we'll resize images with a width greater than 5000\n",
    "        if w > 5000:\n",
    "            img = resize(img, h, w)\n",
    "\n",
    "            # Get the new dimensions\n",
    "            h, w = img.shape[:2]\n",
    "\n",
    "        # Detect straight lines in the image\n",
    "        lines = find_lines(img)\n",
    "\n",
    "        # Use the detected lines to check for skew\n",
    "        # I'm not actually sure if these deskewing steps are useful\n",
    "        angle = check_for_skew(lines)\n",
    "\n",
    "        # If image seems to be skewed, then deskew!\n",
    "        if angle != 0:\n",
    "            # print('Deskewing')\n",
    "            img = deskew(img, angle)\n",
    "\n",
    "            # Once deskewed we have to redo line detection because positions will have changed\n",
    "            lines = find_lines(img)\n",
    "\n",
    "        #display_lines(image_name, output_dir, img, lines)\n",
    "\n",
    "        # Filter the detected lines to identify columns\n",
    "        gutter, columns = find_columns(lines, h, w)\n",
    "\n",
    "        # Find the position of the header\n",
    "        header = find_header(img)\n",
    "\n",
    "        # Draw a grid on image (for debugging)\n",
    "        if grid:\n",
    "            img = add_grid(img)\n",
    "\n",
    "        # Draw detected column & header lines on image and save the results (for testing)\n",
    "        if markup:\n",
    "\n",
    "            # Draw gutter\n",
    "            # Positions are means, so they might not be whole numbers, but OpenCV needs integer coordinates\n",
    "            gutter_x = int(round(gutter))\n",
    "            cv2.line(img,(gutter_x,0),(gutter_x,h),(0,255,0),5)\n",
    "\n",
    "            # Draw columns\n",
    "            for column in columns:\n",
    "                column_x = int(round(column))\n",
    "                cv2.line(img,(column_x,0),(column_x,h),(0,255,0),5)\n",
    "\n",
    "            # Draw header\n",
    "            cv2.line(img,(0, header),(w, header),(255,0,0),3)\n",
    "\n",
    "            # Save the annotated image\n",
    "            cv2.imwrite('{}/{}.jpg'.format(output_dir, image_name[:-4]), img)\n",
    "\n",
    "        # Otherwise crop & save columns and header\n",
    "        else:\n",
    "            # Crop and save header\n",
    "            save_header(img, header, w, image_name, output_dir)\n",
    "\n",
    "            # Add gutter and page width to the columns list\n",
    "            columns = [gutter] + columns + [w]\n",
    "\n",
    "            # Crop and save columns\n",
    "            save_columns(img, columns, header, h, image_name, output_dir)\n",
    "\n",
    "\n",
    "# ADDED\n",
    "\n",
    "def get_image_params(image_path, image_root=os.path.join(os.sep, 'webdav')):\n",
    "    '''\n",
    "    Opens an image and looks for columns.\n",
    "\n",
    "    Parameters:\n",
    "        image_path – path of the image, relative to image_root\n",
    "        image_root – directory that image_path is joined to (defaults to 'webdav' at the filesystem root)\n",
    "\n",
    "    Returns a Pandas Series containing, in this order:\n",
    "        height, width, number of column positions, column positions.\n",
    "    Note that height comes BEFORE width.\n",
    "    The dimensions are those of the image after any resizing.\n",
    "    The column positions are a comma-separated string, with the gutter as the first value.\n",
    "    If the image can't be read the values are 0, 0, 0 and an empty string.\n",
    "    '''\n",
    "    # Changed to point to absolute path\n",
    "    img = cv2.imread(os.path.join(image_root, image_path))\n",
    "    # This is just to weed out dodgy images\n",
    "    try:\n",
    "        h, w = img.shape[:2]\n",
    "    except AttributeError:\n",
    "        print('Not a valid image')\n",
    "        new_h = 0\n",
    "        new_w = 0\n",
    "        columns = []\n",
    "        column_positions = ''\n",
    "    else:\n",
    "        if w > 5000:\n",
    "            img = resize(img, h, w)\n",
    "            new_h, new_w = img.shape[:2]\n",
    "        else:\n",
    "            new_h = h\n",
    "            new_w = w\n",
    "        lines = find_lines(img)\n",
    "        angle = check_for_skew(lines)\n",
    "        if angle != 0:\n",
    "            img = deskew(img, angle)\n",
    "            lines = find_lines(img)\n",
    "        gutter, columns = find_columns(lines, new_h, new_w)\n",
    "        # The gutter is included with the columns in the results\n",
    "        columns.insert(0, gutter)\n",
    "        column_positions = ','.join([str(c) for c in columns])\n",
    "    return pd.Series([new_h, new_w, len(columns), column_positions])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def data_by_year(start=1901):\n",
    "    '''\n",
    "    Split the big file list up by year, add image/column details, then save each year as a new CSV.\n",
    "\n",
    "    Parameters:\n",
    "        start – years before this are skipped\n",
    "\n",
    "    Reads 'files_with_dates.csv' and writes a '<year>.csv' file for each year, both in the current directory.\n",
    "    tqdm.pandas() needs to have been called first to make progress_apply available.\n",
    "    '''\n",
    "    df = pd.read_csv('files_with_dates.csv')\n",
    "    years = df.groupby('year')\n",
    "    for year, files in years:\n",
    "        if int(year) >= start:\n",
    "            print(year)\n",
    "\n",
    "            # Work on a copy, because each group is a slice of the big dataframe\n",
    "            files = files.copy()\n",
    "\n",
    "            # Each Series returned by get_image_params becomes a row in a new dataframe\n",
    "            params = files.progress_apply(lambda x: get_image_params(x['path']), axis=1)\n",
    "\n",
    "            # get_image_params returns height BEFORE width, so label the values in that order...\n",
    "            params.columns = ['height', 'width', 'columns', 'column_positions']\n",
    "\n",
    "            # ...then add them by name so the CSV has width, height, columns, column_positions\n",
    "            for field in ['width', 'height', 'columns', 'column_positions']:\n",
    "                files[field] = params[field]\n",
    "            files.to_csv('{}.csv'.format(year), index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "tqdm.pandas(desc='Images', leave=False)\n",
    "data_by_year(1950)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}