{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pickle\n", "import nltk\n", "\n", "stats = pickle.load( open( \"/media/storage/dpla-data/pickles/new/newstats.p\", \"rb\" ) )\n", "common = pickle.load( open( \"/media/storage/dpla-data/pickles/new/common.p\", \"rb\" ) )\n", "searcom = pickle.load( open( \"/media/storage/dpla-data/pickles/new/sear_common.p\", \"rb\" ) )\n", "searfilt = pickle.load(open( \"/media/storage/dpla-data/pickles/new/searches_filtered.p\", \"rb\" ) ) " ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'artstor': {'funiq': 60168,\n", " 'fwc': 5025070,\n", " 'haps': 29757,\n", " 'lowerhaps': 24103,\n", " 'uniq': 60293,\n", " 'wc': 6972534},\n", " 'biodiv': {'funiq': 94248,\n", " 'fwc': 5658739,\n", " 'haps': 44804,\n", " 'lowerhaps': 38471,\n", " 'uniq': 94372,\n", " 'wc': 6381376},\n", " 'commonwealth': {'funiq': 204577,\n", " 'fwc': 11348522,\n", " 'haps': 159095,\n", " 'lowerhaps': 154009,\n", " 'uniq': 204703,\n", " 'wc': 14022356},\n", " 'georgia': {'funiq': 150863,\n", " 'fwc': 32656431,\n", " 'haps': 89668,\n", " 'lowerhaps': 79492,\n", " 'uniq': 150990,\n", " 'wc': 42031491},\n", " 'getty': {'funiq': 54355,\n", " 'fwc': 14251103,\n", " 'haps': 11663,\n", " 'lowerhaps': 9767,\n", " 'uniq': 54474,\n", " 'wc': 18732730},\n", " 'gpo': {'funiq': 437646,\n", " 'fwc': 21860075,\n", " 'haps': 351637,\n", " 'lowerhaps': 343619,\n", " 'uniq': 437770,\n", " 'wc': 26316103},\n", " 'harvard': {'funiq': 35918,\n", " 'fwc': 849987,\n", " 'haps': 20447,\n", " 'lowerhaps': 18025,\n", " 'uniq': 36036,\n", " 'wc': 968898},\n", " 'ia': {'funiq': 502974,\n", " 'fwc': 16996418,\n", " 'haps': 394559,\n", " 'lowerhaps': 378206,\n", " 'uniq': 503101,\n", " 'wc': 23288038},\n", " 'illinois': {'funiq': 49018,\n", " 'fwc': 1829267,\n", " 'haps': 29755,\n", " 'lowerhaps': 23849,\n", " 'uniq': 49143,\n", " 'wc': 2385501},\n", " 'kentucky': {'funiq': 30374,\n", " 'fwc': 6800530,\n", " 'haps': 14090,\n", " 'lowerhaps': 11338,\n", " 'uniq': 30498,\n", " 'wc': 9405279},\n", " 'minnesota': {'funiq': 43666,\n", " 'fwc': 3598870,\n", " 'haps': 21112,\n", " 'lowerhaps': 17674,\n", " 'uniq': 43791,\n", " 'wc': 4495075},\n", " 'missouri': {'funiq': 119586,\n", " 'fwc': 3542143,\n", " 'haps': 90859,\n", " 'lowerhaps': 85222,\n", " 'uniq': 119713,\n", " 'wc': 4256929},\n", " 'mwdl': {'funiq': 793849,\n", " 'fwc': 87424176,\n", " 'haps': 542772,\n", " 'lowerhaps': 504876,\n", " 'uniq': 793976,\n", " 'wc': 111155337},\n", " 'nara': {'funiq': 1082133,\n", " 'fwc': 54355031,\n", " 'haps': 990235,\n", " 'lowerhaps': 978968,\n", " 'uniq': 1082259,\n", " 'wc': 65649116},\n", " 'nocar': {'funiq': 258024,\n", " 'fwc': 27360155,\n", " 'haps': 165815,\n", " 'lowerhaps': 157524,\n", " 'uniq': 258151,\n", " 'wc': 33487819},\n", " 'nocoll': {'funiq': 1785,\n", " 'fwc': 4626,\n", " 'haps': 1307,\n", " 'lowerhaps': 1202,\n", " 'uniq': 1867,\n", " 'wc': 6192},\n", " 'rumsey': {'funiq': 47343,\n", " 'fwc': 8825833,\n", " 'haps': 14682,\n", " 'lowerhaps': 12520,\n", " 'uniq': 47463,\n", " 'wc': 11667865},\n", " 'smiths': {'funiq': 432279,\n", " 'fwc': 51922316,\n", " 'haps': 182337,\n", " 'lowerhaps': 157337,\n", " 'uniq': 432406,\n", " 'wc': 59927374},\n", " 'socar': {'funiq': 61687,\n", " 'fwc': 5809794,\n", " 'haps': 31055,\n", " 'lowerhaps': 25606,\n", " 'uniq': 61813,\n", " 'wc': 7138136},\n", " 'texas': {'funiq': 855594,\n", " 'fwc': 72699549,\n", " 'haps': 245710,\n", " 'lowerhaps': 237998,\n", " 'uniq': 855720,\n", " 'wc': 88574895},\n", " 'usc': {'funiq': 259523,\n", " 'fwc': 41542851,\n", " 'haps': 108852,\n", " 'lowerhaps': 94218,\n", " 'uniq': 259650,\n", " 'wc': 49296854},\n", " 'virginia': {'funiq': 41374,\n", " 'fwc': 1790985,\n", " 'haps': 32902,\n", " 'lowerhaps': 31738,\n", " 'uniq': 41493,\n", " 'wc': 2248517}}" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stats" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import pandas as pd\n", "df = pd.DataFrame(stats)\n", "df.columns = ['ARTstor', 'Biodiversity Heritage Library', 'Digital Commonwealth', 'Digital Library of Georgia',\n", " 'J. Paul Getty Trust', 'United States Government Printing Office (GPO)', 'Harvard Library',\n", " 'Internet Archive', 'University of Illinois at Urbana-Champaign', 'Kentucky Digital Library',\n", " 'Minnesota Digital Library', 'Missouri Hub', 'Mountain West Digital Library',\n", " 'National Archives and Records Administration', 'North Carolina Digital Heritage Center',\n", " ' ', 'David Rumsey', 'Smithsonian Institution', 'South Carolina Digital Library', \n", " 'The Portal to Texas History', 'University of Southern California. Libraries',\n", " 'University of Virginia Library']\n", "df.T\n", "df.T.to_csv(\"nltk.stats.csv\")" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", " | index | \n", "variable | \n", "value | \n", "
---|---|---|---|
15 | \n", "\n", " | funiq | \n", "1785 | \n", "
37 | \n", "\n", " | fwc | \n", "4626 | \n", "
59 | \n", "\n", " | haps | \n", "1307 | \n", "
81 | \n", "\n", " | lowerhaps | \n", "1202 | \n", "
103 | \n", "\n", " | uniq | \n", "1867 | \n", "
125 | \n", "\n", " | wc | \n", "6192 | \n", "
0 | \n", "ARTstor | \n", "funiq | \n", "60168 | \n", "
22 | \n", "ARTstor | \n", "fwc | \n", "5025070 | \n", "
44 | \n", "ARTstor | \n", "haps | \n", "29757 | \n", "
66 | \n", "ARTstor | \n", "lowerhaps | \n", "24103 | \n", "
88 | \n", "ARTstor | \n", "uniq | \n", "60293 | \n", "
110 | \n", "ARTstor | \n", "wc | \n", "6972534 | \n", "
1 | \n", "Biodiversity Heritage Library | \n", "funiq | \n", "94248 | \n", "
23 | \n", "Biodiversity Heritage Library | \n", "fwc | \n", "5658739 | \n", "
45 | \n", "Biodiversity Heritage Library | \n", "haps | \n", "44804 | \n", "
67 | \n", "Biodiversity Heritage Library | \n", "lowerhaps | \n", "38471 | \n", "
89 | \n", "Biodiversity Heritage Library | \n", "uniq | \n", "94372 | \n", "
111 | \n", "Biodiversity Heritage Library | \n", "wc | \n", "6381376 | \n", "
16 | \n", "David Rumsey | \n", "funiq | \n", "47343 | \n", "
38 | \n", "David Rumsey | \n", "fwc | \n", "8825833 | \n", "
60 | \n", "David Rumsey | \n", "haps | \n", "14682 | \n", "
82 | \n", "David Rumsey | \n", "lowerhaps | \n", "12520 | \n", "
104 | \n", "David Rumsey | \n", "uniq | \n", "47463 | \n", "
126 | \n", "David Rumsey | \n", "wc | \n", "11667865 | \n", "
2 | \n", "Digital Commonwealth | \n", "funiq | \n", "204577 | \n", "
24 | \n", "Digital Commonwealth | \n", "fwc | \n", "11348522 | \n", "
46 | \n", "Digital Commonwealth | \n", "haps | \n", "159095 | \n", "
68 | \n", "Digital Commonwealth | \n", "lowerhaps | \n", "154009 | \n", "
90 | \n", "Digital Commonwealth | \n", "uniq | \n", "204703 | \n", "
112 | \n", "Digital Commonwealth | \n", "wc | \n", "14022356 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "
19 | \n", "The Portal to Texas History | \n", "funiq | \n", "855594 | \n", "
41 | \n", "The Portal to Texas History | \n", "fwc | \n", "72699549 | \n", "
63 | \n", "The Portal to Texas History | \n", "haps | \n", "245710 | \n", "
85 | \n", "The Portal to Texas History | \n", "lowerhaps | \n", "237998 | \n", "
107 | \n", "The Portal to Texas History | \n", "uniq | \n", "855720 | \n", "
129 | \n", "The Portal to Texas History | \n", "wc | \n", "88574895 | \n", "
5 | \n", "United States Government Printing Office (GPO) | \n", "funiq | \n", "437646 | \n", "
27 | \n", "United States Government Printing Office (GPO) | \n", "fwc | \n", "21860075 | \n", "
49 | \n", "United States Government Printing Office (GPO) | \n", "haps | \n", "351637 | \n", "
71 | \n", "United States Government Printing Office (GPO) | \n", "lowerhaps | \n", "343619 | \n", "
93 | \n", "United States Government Printing Office (GPO) | \n", "uniq | \n", "437770 | \n", "
115 | \n", "United States Government Printing Office (GPO) | \n", "wc | \n", "26316103 | \n", "
8 | \n", "University of Illinois at Urbana-Champaign | \n", "funiq | \n", "49018 | \n", "
30 | \n", "University of Illinois at Urbana-Champaign | \n", "fwc | \n", "1829267 | \n", "
52 | \n", "University of Illinois at Urbana-Champaign | \n", "haps | \n", "29755 | \n", "
74 | \n", "University of Illinois at Urbana-Champaign | \n", "lowerhaps | \n", "23849 | \n", "
96 | \n", "University of Illinois at Urbana-Champaign | \n", "uniq | \n", "49143 | \n", "
118 | \n", "University of Illinois at Urbana-Champaign | \n", "wc | \n", "2385501 | \n", "
20 | \n", "University of Southern California. Libraries | \n", "funiq | \n", "259523 | \n", "
42 | \n", "University of Southern California. Libraries | \n", "fwc | \n", "41542851 | \n", "
64 | \n", "University of Southern California. Libraries | \n", "haps | \n", "108852 | \n", "
86 | \n", "University of Southern California. Libraries | \n", "lowerhaps | \n", "94218 | \n", "
108 | \n", "University of Southern California. Libraries | \n", "uniq | \n", "259650 | \n", "
130 | \n", "University of Southern California. Libraries | \n", "wc | \n", "49296854 | \n", "
21 | \n", "University of Virginia Library | \n", "funiq | \n", "41374 | \n", "
43 | \n", "University of Virginia Library | \n", "fwc | \n", "1790985 | \n", "
65 | \n", "University of Virginia Library | \n", "haps | \n", "32902 | \n", "
87 | \n", "University of Virginia Library | \n", "lowerhaps | \n", "31738 | \n", "
109 | \n", "University of Virginia Library | \n", "uniq | \n", "41493 | \n", "
131 | \n", "University of Virginia Library | \n", "wc | \n", "2248517 | \n", "
132 rows × 3 columns
\n", "