# %% [markdown]
# # Finding all the faces in the Tribune collection
#
# This notebook runs a facial detection script across the whole Tribune collection. It saves cropped versions of all the detected faces, and creates a data file recording the number of faces detected per image.

# %%
import cv2
import pandas as pd
import os
from urllib.parse import urlparse
import requests
from IPython.display import display, HTML
import copy
from tqdm import tqdm_notebook
from tqdm.auto import tqdm
import altair as alt

# %%
# Configuration: everything a reader might want to change lives here.
IMAGES_CSV_URL = 'https://raw.githubusercontent.com/GLAM-Workbench/ozglam-data-records-of-resistance/master/data/images.csv'
IMAGE_URL_TEMPLATE = 'https://s3-ap-southeast-2.amazonaws.com/wraggetribune/images/{0}.jpg'
# Folder holding a local copy of the images; override via detect_all(image_dir=...).
LOCAL_IMAGE_DIR = '/Volumes/bigdata/mydata/SLNSW/Tribune/images'

# %%
# Load Tribune images data
df = pd.read_csv(IMAGES_CSV_URL)

# %%
# Link to the facial detection data file
face_cl = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')


def download_image(img_url):
    '''
    Download and save the specified image.

    The file is written to a 'downloads' folder inside the current working
    directory (created if missing) and named after the last segment of the
    URL path.

    Parameters:
        img_url: URL of the image to fetch.

    Returns:
        The full path of the saved file.

    Raises:
        requests.HTTPError: if the server answers with an error status, so an
            error page is never saved to disk as if it were an image.
    '''
    download_dir = os.path.join(os.getcwd(), 'downloads')
    os.makedirs(download_dir, exist_ok=True)
    filename = os.path.join(download_dir, os.path.basename(urlparse(img_url).path))
    # The context manager releases the connection even if writing fails.
    with requests.get(img_url, stream=True, timeout=60) as response:
        response.raise_for_status()
        with open(filename, 'wb') as fd:
            for chunk in response.iter_content(chunk_size=8192):
                fd.write(chunk)
    return filename


def detect_faces(img_file, save_annotated=True):
    '''
    Use OpenCV to find faces.

    Each detected face is cropped and saved to 'faces/<image name>-<n>.jpg',
    where <n> counts from 1.

    Parameters:
        img_file: path of the image to analyse.
        save_annotated: if True, overwrite img_file with a copy that has a
            green box drawn around every detected face.

    Returns:
        The detections from the cascade classifier; each item unpacks to
        (x, y, w, h). Use len() to count them.

    Raises:
        ValueError: if OpenCV cannot read img_file.
    '''
    os.makedirs('faces', exist_ok=True)
    image = cv2.imread(img_file)
    # imread signals failure by returning None rather than raising.
    if image is None:
        raise ValueError('Could not read image file: {}'.format(img_file))
    # Create a greyscale copy for face detection.
    # imread returns channels in BGR order, hence BGR2GRAY rather than RGB2GRAY.
    grey = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Find faces!
    # Try adjusting scaleFactor and minNeighbors if results aren't what you expect.
    faces = face_cl.detectMultiScale(grey, scaleFactor=1.3, minNeighbors=4, minSize=(50, 50))
    # Create a copy to annotate, so the crops are taken from the clean image
    results = image.copy()
    stem = os.path.splitext(os.path.basename(img_file))[0]
    for n, (x, y, w, h) in enumerate(faces, 1):
        # Save a cropped version of the detected face
        face = image[y: y + h, x: x + w]
        cv2.imwrite(os.path.join('faces', '{}-{}.jpg'.format(stem, n)), face)
        # Draw a green box on the complete image
        cv2.rectangle(results, (x, y), (x + w, y + h), (0, 255, 0), 2)
    # Save the annotated image
    if save_annotated:
        cv2.imwrite(img_file, results)
    return faces


def process_images(images):
    '''
    Find faces in a list of images.
    Displays the results

    Parameters:
        images: iterable of (img_id, img_url) pairs.

    Each image is downloaded, annotated in place, and shown in the notebook
    followed by the cropped faces found in it.
    '''
    for img_id, img_url in images:
        filename = download_image(img_url)
        faces = detect_faces(filename)
        image_name = os.path.basename(filename)
        # Crops are named after the downloaded file, so derive the prefix from it.
        stem = os.path.splitext(image_name)[0]
        # NOTE(review): the original markup was lost from this file; this is a
        # reconstruction that shows the annotated image, its id, then each crop.
        html = '<div><img src="downloads/{}" style="max-width: 100%;"><p>Image id: {}</p>'.format(image_name, img_id)
        print('I found {} faces...'.format(len(faces)))
        for i, face in enumerate(faces, 1):
            html += '<img src="faces/{}-{}.jpg" style="margin: 0 10px 10px 0;">'.format(stem, i)
        html += '</div>'
        display(HTML(html))


def get_image_by_id(img_id):
    '''
    Process a specific image.

    Parameters:
        img_id: image identifier, substituted into IMAGE_URL_TEMPLATE.
    '''
    images = [(img_id, IMAGE_URL_TEMPLATE.format(img_id))]
    process_images(images)


# %%
def download_and_detect(limit=None):
    '''
    Download ALL THE IMAGES and look for faces.
    This will take a long time and use up lots of disk space.
    Pass limit=N to process only the first N images and test it out.

    Parameters:
        limit: optional maximum number of images to process; None means all.

    Returns:
        A list of {'image': <file name>, 'faces': <count>} dicts, one per image.
    '''
    face_data = []
    # assumes images.csv has an 'images' column of image ids -- TODO confirm
    images = df['images'].tolist()
    if limit is not None:
        images = images[:limit]
    for img_id in tqdm(images):
        img_url = IMAGE_URL_TEMPLATE.format(img_id)
        img_file = download_image(img_url)
        faces = detect_faces(img_file, save_annotated=False)
        # Record the file name so rows match the ones detect_all() produces.
        face_data.append({'image': os.path.basename(img_file), 'faces': len(faces)})
    return face_data


# %%
def detect_all(image_dir=LOCAL_IMAGE_DIR):
    '''
    I've already got copies of all the images, so I'll just point the script at them.

    Parameters:
        image_dir: folder containing the .jpg files to scan.

    Returns:
        A list of {'image': <file name>, 'faces': <count>} dicts, one per image.
    '''
    face_data = []
    # Sorted so the output order is the same on every run and every machine.
    images = sorted(i for i in os.listdir(image_dir) if i.endswith('.jpg'))
    for image in tqdm(images):
        img_file = os.path.join(image_dir, image)
        faces = detect_faces(img_file, save_annotated=False)
        face_data.append({'image': image, 'faces': len(faces)})
    return face_data


# %%
# FIND ALL THE FACES
face_data = detect_all()

# %%
# Convert to a dataframe and save as CSV
# Kept separate from `df` so the images table stays available to download_and_detect().
faces_df = pd.DataFrame(face_data, columns=['image', 'faces'])
faces_df.to_csv('faces_per_image.csv', index=False)