{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Visualising DROID CSV\n", "=====================\n", "\n", "This is a quick experiment in visualising format identification results from DROID using a Jupyter Notebook.\n", "\n", "First we load in some example results..." ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IDPARENT_IDURIFILE_PATHNAMEMETHODSTATUSSIZETYPEEXTLAST_MODIFIEDEXTENSION_MISMATCHSHA1_HASHFORMAT_COUNTPUIDMIME_TYPEFORMAT_NAMEFORMAT_VERSION
020file:////10.1.4.222/gda/archives-sample-files/...\\\\10.1.4.222\\gda\\archives-sample-files\\opf-for...format-corpusDoneFolder2014-02-28T15:49:11False
132file:////10.1.4.222/gda/archives-sample-files/...\\\\10.1.4.222\\gda\\archives-sample-files\\opf-for...videoDoneFolder2014-02-28T15:48:47False
243file:////10.1.4.222/gda/archives-sample-files/...\\\\10.1.4.222\\gda\\archives-sample-files\\opf-for...QuicktimeDoneFolder2014-02-28T15:48:59False
354file:////10.1.4.222/gda/archives-sample-files/...\\\\10.1.4.222\\gda\\archives-sample-files\\opf-for...apple-intermediate-codec.movSignatureDone319539Filemov2014-02-18T16:58:16Falsed097cf36467373f52b974542d48bec134279fa3f1x-fmt/384video/quicktimeQuicktime
464file:////10.1.4.222/gda/archives-sample-files/...\\\\10.1.4.222\\gda\\archives-sample-files\\opf-for...animation.movSignatureDone1020209Filemov2014-02-18T16:58:16Falseedb5226b963f449ce58054809149cb812bdf8c0a1x-fmt/384video/quicktimeQuicktime
.........................................................
394396395file:////10.1.4.222/gda/archives-sample-files/...\\\\10.1.4.222\\gda\\archives-sample-files\\opf-for...InDesignDoneFolder2014-02-28T15:49:11False
395397396file:////10.1.4.222/gda/archives-sample-files/...\\\\10.1.4.222\\gda\\archives-sample-files\\opf-for...Neddy_Flyer_ft_HeatherRyan.jpgSignatureDone1620612Filejpg2014-02-18T16:58:08False884de50cb1c052c0e10bef306850ee995d9651751fmt/41image/jpegRaw JPEG Stream
396398396file:////10.1.4.222/gda/archives-sample-files/...\\\\10.1.4.222\\gda\\archives-sample-files\\opf-for...Neddy_Flyer_HeatherRyan.pdfSignatureDone59106Filepdf2014-02-18T16:58:08False9e19b76e8364c840945bc380ab5f98f00a23ab801fmt/17application/pdfAcrobat PDF 1.3 - Portable Document Format1.3
397399396file:////10.1.4.222/gda/archives-sample-files/...\\\\10.1.4.222\\gda\\archives-sample-files\\opf-for...Neddy_Flyer_HeatherRyan.inddSignatureDone1503232Fileindd2014-02-18T16:58:08Falsed9211fe38e79f34fb7a043fe34df59527fb6e1791fmt/196Adobe InDesign DocumentCS
398400396file:////10.1.4.222/gda/archives-sample-files/...\\\\10.1.4.222\\gda\\archives-sample-files\\opf-for...Neddy_Flyer_README_HeatherRyan.md.rtfSignatureDone1210Filertf2014-02-18T16:58:08False3665b0c1457f996359939746752fe86f2025b68d1fmt/50application/rtf, text/rtfRich Text Format1.5-1.6
\n", "

399 rows × 18 columns

\n", "
" ], "text/plain": [ " ID PARENT_ID URI \\\n", "0 2 0 file:////10.1.4.222/gda/archives-sample-files/... \n", "1 3 2 file:////10.1.4.222/gda/archives-sample-files/... \n", "2 4 3 file:////10.1.4.222/gda/archives-sample-files/... \n", "3 5 4 file:////10.1.4.222/gda/archives-sample-files/... \n", "4 6 4 file:////10.1.4.222/gda/archives-sample-files/... \n", ".. ... ... ... \n", "394 396 395 file:////10.1.4.222/gda/archives-sample-files/... \n", "395 397 396 file:////10.1.4.222/gda/archives-sample-files/... \n", "396 398 396 file:////10.1.4.222/gda/archives-sample-files/... \n", "397 399 396 file:////10.1.4.222/gda/archives-sample-files/... \n", "398 400 396 file:////10.1.4.222/gda/archives-sample-files/... \n", "\n", " FILE_PATH \\\n", "0 \\\\10.1.4.222\\gda\\archives-sample-files\\opf-for... \n", "1 \\\\10.1.4.222\\gda\\archives-sample-files\\opf-for... \n", "2 \\\\10.1.4.222\\gda\\archives-sample-files\\opf-for... \n", "3 \\\\10.1.4.222\\gda\\archives-sample-files\\opf-for... \n", "4 \\\\10.1.4.222\\gda\\archives-sample-files\\opf-for... \n", ".. ... \n", "394 \\\\10.1.4.222\\gda\\archives-sample-files\\opf-for... \n", "395 \\\\10.1.4.222\\gda\\archives-sample-files\\opf-for... \n", "396 \\\\10.1.4.222\\gda\\archives-sample-files\\opf-for... \n", "397 \\\\10.1.4.222\\gda\\archives-sample-files\\opf-for... \n", "398 \\\\10.1.4.222\\gda\\archives-sample-files\\opf-for... \n", "\n", " NAME METHOD STATUS SIZE TYPE \\\n", "0 format-corpus Done Folder \n", "1 video Done Folder \n", "2 Quicktime Done Folder \n", "3 apple-intermediate-codec.mov Signature Done 319539 File \n", "4 animation.mov Signature Done 1020209 File \n", ".. ... ... ... ... ... 
\n", "394 InDesign Done Folder \n", "395 Neddy_Flyer_ft_HeatherRyan.jpg Signature Done 1620612 File \n", "396 Neddy_Flyer_HeatherRyan.pdf Signature Done 59106 File \n", "397 Neddy_Flyer_HeatherRyan.indd Signature Done 1503232 File \n", "398 Neddy_Flyer_README_HeatherRyan.md.rtf Signature Done 1210 File \n", "\n", " EXT LAST_MODIFIED EXTENSION_MISMATCH \\\n", "0 2014-02-28T15:49:11 False \n", "1 2014-02-28T15:48:47 False \n", "2 2014-02-28T15:48:59 False \n", "3 mov 2014-02-18T16:58:16 False \n", "4 mov 2014-02-18T16:58:16 False \n", ".. ... ... ... \n", "394 2014-02-28T15:49:11 False \n", "395 jpg 2014-02-18T16:58:08 False \n", "396 pdf 2014-02-18T16:58:08 False \n", "397 indd 2014-02-18T16:58:08 False \n", "398 rtf 2014-02-18T16:58:08 False \n", "\n", " SHA1_HASH FORMAT_COUNT PUID \\\n", "0 \n", "1 \n", "2 \n", "3 d097cf36467373f52b974542d48bec134279fa3f 1 x-fmt/384 \n", "4 edb5226b963f449ce58054809149cb812bdf8c0a 1 x-fmt/384 \n", ".. ... ... ... \n", "394 \n", "395 884de50cb1c052c0e10bef306850ee995d965175 1 fmt/41 \n", "396 9e19b76e8364c840945bc380ab5f98f00a23ab80 1 fmt/17 \n", "397 d9211fe38e79f34fb7a043fe34df59527fb6e179 1 fmt/196 \n", "398 3665b0c1457f996359939746752fe86f2025b68d 1 fmt/50 \n", "\n", " MIME_TYPE FORMAT_NAME \\\n", "0 \n", "1 \n", "2 \n", "3 video/quicktime Quicktime \n", "4 video/quicktime Quicktime \n", ".. ... ... \n", "394 \n", "395 image/jpeg Raw JPEG Stream \n", "396 application/pdf Acrobat PDF 1.3 - Portable Document Format \n", "397 Adobe InDesign Document \n", "398 application/rtf, text/rtf Rich Text Format \n", "\n", " FORMAT_VERSION \n", "0 \n", "1 \n", "2 \n", "3 \n", "4 \n", ".. ... 
\n", "394 \n", "395 \n", "396 1.3 \n", "397 CS \n", "398 1.5-1.6 \n", "\n", "[399 rows x 18 columns]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from io import StringIO\n", "import pandas as pd\n", "import requests\n", "\n", "# DROID analysis CSV for the OPF test corpus, hosted in the demystify repository.\n", "url = 'https://raw.githubusercontent.com/exponential-decay/demystify/master/opf-test-corpus-test-output/opf-test-corpus-droid-analysis.csv'\n", "\n", "# Bound the wait and fail on HTTP errors so an error page is never parsed as CSV.\n", "response = requests.get(url, timeout=30)\n", "response.raise_for_status()\n", "\n", "# keep_default_na=False leaves empty fields as empty strings instead of NaN.\n", "df = pd.read_csv(StringIO(response.text), keep_default_na=False)\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we have the data, we can explore ways to visualise it.\n", "\n", "Here's a simple bar chart of all the different types and PUIDs..." ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "" ], "text/plain": [ "alt.Chart(...)" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import altair as alt\n", "\n", "# Bar chart of row counts per PUID (sorted by descending count), coloured by TYPE.\n", "puid_encoding = dict(\n", "    x=alt.X('PUID', sort='-y'),\n", "    y='count()',\n", "    color='TYPE',\n", "    tooltip=['TYPE','PUID', 'FORMAT_NAME', 'FORMAT_VERSION', 'count()'],\n", ")\n", "alt.Chart(df).mark_bar().encode(**puid_encoding).interactive()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Alternatively, we can group together the different MIME types, and use the colours to show the various PUIDs associated with each..." ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "" ], "text/plain": [ "alt.Chart(...)" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Bar chart of row counts per MIME type (sorted by descending count), coloured by PUID with the legend hidden.\n", "mime_encoding = dict(\n", "    x=alt.X('MIME_TYPE', sort='-y'),\n", "    y='count()',\n", "    color=alt.Color('PUID', legend=None),\n", "    tooltip=['TYPE','MIME_TYPE', 'PUID', 'FORMAT_NAME', 'FORMAT_VERSION', 'count()'],\n", ")\n", "alt.Chart(df).mark_bar().encode(**mime_encoding).interactive()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 4 }