{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Finding all the faces in the Tribune collection\n", "\n", "This notebook runs a facial detection script across the whole Tribune collection. It saves cropped versions of all the detected faces, and creates a data file recording the number of faces detected per image." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
If you haven't used one of these notebooks before, they're basically web pages in which you can write, edit, and run live code. They're meant to encourage experimentation, so don't feel nervous. Just try running a few cells and see what happens!
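" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The code cells below import the libraries this notebook uses: OpenCV (cv2), pandas, requests, tqdm and Altair. If any of those imports fail, you can install the missing packages from inside the notebook. The next cell is a minimal sketch that assumes pip is available in your Jupyter environment; uncomment it before running." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Uncomment and run this cell if any of the imports in the next cell fail.\n", "# This assumes pip is available in the current Jupyter environment.\n", "# import sys\n", "# !{sys.executable} -m pip install opencv-python pandas requests tqdm altair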
" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import cv2\n", "import pandas as pd\n", "import os\n", "from urllib.parse import urlparse\n", "import requests\n", "from IPython.display import display, HTML\n", "import copy\n", "from tqdm import tqdm_notebook\n", "import altair as alt" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Load Tribune images data\n", "df = pd.read_csv('https://raw.githubusercontent.com/GLAM-Workbench/ozglam-data-records-of-resistance/master/data/images.csv')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Load OpenCV's pre-trained frontal face classifier (Haar cascade)\n", "face_cl = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')\n", "\n", "\n", "def download_image(img_url):\n", "    '''\n", "    Download and save the specified image.\n", "    '''\n", "    current_dir = os.getcwd()\n", "    os.makedirs(os.path.join(current_dir, 'downloads'), exist_ok=True)\n", "    parsed = urlparse(img_url)\n", "    filename = os.path.join(current_dir, 'downloads', os.path.basename(parsed.path))\n", "    response = requests.get(img_url, stream=True)\n", "    with open(filename, 'wb') as fd:\n", "        for chunk in response.iter_content(chunk_size=128):\n", "            fd.write(chunk)\n", "    return filename\n", "\n", "\n", "def detect_faces(img_file, save_annotated=True):\n", "    '''\n", "    Use OpenCV to find faces.\n", "    '''\n", "    faces = []\n", "    f = 1\n", "    os.makedirs('faces', exist_ok=True)\n", "    # print('Processing {}'.format(img_file))\n", "    try:\n", "        image = cv2.imread(img_file)\n", "        # Create a copy to annotate\n", "        results = image.copy()\n", "        # Create a greyscale copy for face detection (OpenCV loads images in BGR order)\n", "        grey = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)\n", "        # Find faces!\n", "        # Try adjusting scaleFactor and minNeighbors if results aren't what you expect.\n", "        faces = face_cl.detectMultiScale(grey, scaleFactor=1.3, minNeighbors=4, minSize=(50, 50))\n", "    except cv2.error:\n", "        raise\n", "    else:\n", "        for (x, y, w, h) in faces:\n", "            # Save a cropped version of the detected face\n", "            face = image[y: y + h, x: x + w]\n", "            cv2.imwrite(os.path.join('faces', '{}-{}.jpg'.format(os.path.splitext(os.path.basename(img_file))[0], f)), face)\n", "            # Draw a green box on the complete image\n", "            cv2.rectangle(results, (x, y), (x + w, y + h), (0, 255, 0), 2)\n", "            f += 1\n", "        # Save the annotated image\n", "        if save_annotated:\n", "            cv2.imwrite(img_file, results)\n", "    return faces\n", "\n", "\n", "def process_images(images):\n", "    '''\n", "    Find faces in a list of images and display the results.\n", "    '''\n", "    for img_id, img_url in images:\n", "        filename = download_image(img_url)\n", "        faces = detect_faces(filename)\n", "        # Show the annotated image with a link back to its SLNSW record (URL pattern assumed)\n", "        html = '<img src=\"downloads/{0}\"><br><a target=\"_blank\" href=\"http://archival.sl.nsw.gov.au/Details/image/{1}\">More details at SLNSW</a>
'.format(os.path.basename(filename), img_id)\n", "        print('I found {} faces...'.format(len(faces)))\n", "        for i, face in enumerate(faces, 1):\n", "            # Display each cropped face saved by detect_faces()\n", "            html += '<img src=\"faces/{}-{}.jpg\">'.format(img_id, i)\n", "        display(HTML(html))\n", "\n", "\n", "def get_image_by_id(img_id):\n", "    '''\n", "    Process a specific image.\n", "    '''\n", "    images = [(img_id, 'https://s3-ap-southeast-2.amazonaws.com/wraggetribune/images/{0}.jpg'.format(img_id))]\n", "    process_images(images)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def download_and_detect():\n", "    '''\n", "    Download ALL THE IMAGES and look for faces.\n", "    This will take a long time and use up lots of disk space.\n", "    You can just take a slice of the images list to test it out.\n", "    '''\n", "    face_data = []\n", "    images = df['images'].tolist()\n", "    for image_id in tqdm_notebook(images):\n", "        img_url = 'https://s3-ap-southeast-2.amazonaws.com/wraggetribune/images/{0}.jpg'.format(image_id)\n", "        img_file = download_image(img_url)\n", "        faces = detect_faces(img_file, save_annotated=False)\n", "        face_data.append({'image': image_id, 'faces': len(faces)})\n", "    return face_data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def detect_all():\n", "    '''\n", "    I've already got copies of all the images, so I'll just point the script at them.\n", "    '''\n", "    face_data = []\n", "    image_dir = '/Volumes/bigdata/mydata/SLNSW/Tribune/images'\n", "    images = [i for i in os.listdir(image_dir) if i[-4:] == '.jpg']\n", "    for image in tqdm_notebook(images):\n", "        img_file = os.path.join(image_dir, image)\n", "        faces = detect_faces(img_file, save_annotated=False)\n", "        face_data.append({'image': image, 'faces': len(faces)})\n", "    return face_data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# FIND ALL THE FACES\n", "face_data = detect_all()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Convert to a dataframe and save as CSV\n", "faces_df = pd.DataFrame(face_data)\n", "faces_df.to_csv('faces_per_image.csv', index=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }