{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [
 "import os\n",
 "import pickle\n",
 "import nltk  # NOTE(review): not referenced in this cell -- confirm a later cell needs it\n",
 "\n",
 "# Directory the pickles are read from; change this one constant to relocate the data.\n",
 "PICKLE_DIR = \"/media/storage/dpla-data/pickles/new\"\n",
 "\n",
 "\n",
 "def load_pickle(filename):\n",
 "    \"\"\"Return the object unpickled from PICKLE_DIR/filename.\n",
 "\n",
 "    The file is opened in a with-block so its handle is closed as soon as\n",
 "    loading finishes (or fails) instead of staying open for the kernel's lifetime.\n",
 "    \"\"\"\n",
 "    # NOTE(review): pickle.load can execute arbitrary code; only point this at trusted files.\n",
 "    with open(os.path.join(PICKLE_DIR, filename), \"rb\") as handle:\n",
 "        return pickle.load(handle)\n",
 "\n",
 "\n",
 "# stats maps a provider key to its counters (funiq, fwc, haps, lowerhaps, uniq, wc).\n",
 "stats = load_pickle(\"newstats.p\")\n",
 "common = load_pickle(\"common.p\")\n",
 "searcom = load_pickle(\"sear_common.p\")\n",
 "searfilt = load_pickle(\"searches_filtered.p\")"
 ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'artstor': {'funiq': 60168,\n", " 'fwc': 5025070,\n", " 'haps': 29757,\n", " 'lowerhaps': 24103,\n", " 'uniq': 60293,\n", " 'wc': 6972534},\n", " 'biodiv': {'funiq': 94248,\n", " 'fwc': 5658739,\n", " 'haps': 44804,\n", " 'lowerhaps': 38471,\n", " 'uniq': 94372,\n", " 'wc': 6381376},\n", " 'commonwealth': {'funiq': 204577,\n", " 'fwc': 11348522,\n", " 'haps': 159095,\n", " 'lowerhaps': 154009,\n", " 'uniq': 204703,\n", " 'wc': 14022356},\n", " 'georgia': {'funiq': 150863,\n", " 'fwc': 32656431,\n", " 'haps': 89668,\n", " 'lowerhaps': 79492,\n", " 'uniq': 150990,\n", " 'wc': 42031491},\n", " 'getty': {'funiq': 54355,\n", " 'fwc': 14251103,\n", " 'haps': 11663,\n", " 'lowerhaps': 9767,\n", " 'uniq': 54474,\n", " 'wc': 18732730},\n", " 'gpo': {'funiq': 437646,\n", " 'fwc': 21860075,\n", " 'haps': 351637,\n", " 'lowerhaps': 343619,\n", " 'uniq': 437770,\n", " 'wc': 26316103},\n", " 'harvard': {'funiq': 35918,\n", " 'fwc': 849987,\n", " 'haps': 20447,\n", " 'lowerhaps': 18025,\n", " 'uniq': 36036,\n", " 'wc': 968898},\n", " 'ia': {'funiq': 502974,\n", " 'fwc': 16996418,\n", " 'haps': 394559,\n", " 'lowerhaps': 378206,\n", " 'uniq': 503101,\n", " 'wc': 23288038},\n", " 'illinois': {'funiq': 49018,\n", " 'fwc': 1829267,\n", " 'haps': 29755,\n", " 'lowerhaps': 23849,\n", " 'uniq': 49143,\n", " 
'wc': 2385501},\n", " 'kentucky': {'funiq': 30374,\n", " 'fwc': 6800530,\n", " 'haps': 14090,\n", " 'lowerhaps': 11338,\n", " 'uniq': 30498,\n", " 'wc': 9405279},\n", " 'minnesota': {'funiq': 43666,\n", " 'fwc': 3598870,\n", " 'haps': 21112,\n", " 'lowerhaps': 17674,\n", " 'uniq': 43791,\n", " 'wc': 4495075},\n", " 'missouri': {'funiq': 119586,\n", " 'fwc': 3542143,\n", " 'haps': 90859,\n", " 'lowerhaps': 85222,\n", " 'uniq': 119713,\n", " 'wc': 4256929},\n", " 'mwdl': {'funiq': 793849,\n", " 'fwc': 87424176,\n", " 'haps': 542772,\n", " 'lowerhaps': 504876,\n", " 'uniq': 793976,\n", " 'wc': 111155337},\n", " 'nara': {'funiq': 1082133,\n", " 'fwc': 54355031,\n", " 'haps': 990235,\n", " 'lowerhaps': 978968,\n", " 'uniq': 1082259,\n", " 'wc': 65649116},\n", " 'nocar': {'funiq': 258024,\n", " 'fwc': 27360155,\n", " 'haps': 165815,\n", " 'lowerhaps': 157524,\n", " 'uniq': 258151,\n", " 'wc': 33487819},\n", " 'nocoll': {'funiq': 1785,\n", " 'fwc': 4626,\n", " 'haps': 1307,\n", " 'lowerhaps': 1202,\n", " 'uniq': 1867,\n", " 'wc': 6192},\n", " 'rumsey': {'funiq': 47343,\n", " 'fwc': 8825833,\n", " 'haps': 14682,\n", " 'lowerhaps': 12520,\n", " 'uniq': 47463,\n", " 'wc': 11667865},\n", " 'smiths': {'funiq': 432279,\n", " 'fwc': 51922316,\n", " 'haps': 182337,\n", " 'lowerhaps': 157337,\n", " 'uniq': 432406,\n", " 'wc': 59927374},\n", " 'socar': {'funiq': 61687,\n", " 'fwc': 5809794,\n", " 'haps': 31055,\n", " 'lowerhaps': 25606,\n", " 'uniq': 61813,\n", " 'wc': 7138136},\n", " 'texas': {'funiq': 855594,\n", " 'fwc': 72699549,\n", " 'haps': 245710,\n", " 'lowerhaps': 237998,\n", " 'uniq': 855720,\n", " 'wc': 88574895},\n", " 'usc': {'funiq': 259523,\n", " 'fwc': 41542851,\n", " 'haps': 108852,\n", " 'lowerhaps': 94218,\n", " 'uniq': 259650,\n", " 'wc': 49296854},\n", " 'virginia': {'funiq': 41374,\n", " 'fwc': 1790985,\n", " 'haps': 32902,\n", " 'lowerhaps': 31738,\n", " 'uniq': 41493,\n", " 'wc': 2248517}}" ] }, "execution_count": 2, "metadata": {}, "output_type": 
"execute_result" } ], "source": [ "stats" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "import pandas as pd\n",
 "\n",
 "# Display name for every provider key in `stats`.\n",
 "# 'nocoll' keeps the blank label it had in the original positional label list.\n",
 "PROVIDER_NAMES = {\n",
 "    'artstor': 'ARTstor',\n",
 "    'biodiv': 'Biodiversity Heritage Library',\n",
 "    'commonwealth': 'Digital Commonwealth',\n",
 "    'georgia': 'Digital Library of Georgia',\n",
 "    'getty': 'J. Paul Getty Trust',\n",
 "    'gpo': 'United States Government Printing Office (GPO)',\n",
 "    'harvard': 'Harvard Library',\n",
 "    'ia': 'Internet Archive',\n",
 "    'illinois': 'University of Illinois at Urbana-Champaign',\n",
 "    'kentucky': 'Kentucky Digital Library',\n",
 "    'minnesota': 'Minnesota Digital Library',\n",
 "    'missouri': 'Missouri Hub',\n",
 "    'mwdl': 'Mountain West Digital Library',\n",
 "    'nara': 'National Archives and Records Administration',\n",
 "    'nocar': 'North Carolina Digital Heritage Center',\n",
 "    'nocoll': ' ',\n",
 "    'rumsey': 'David Rumsey',\n",
 "    'smiths': 'Smithsonian Institution',\n",
 "    'socar': 'South Carolina Digital Library',\n",
 "    'texas': 'The Portal to Texas History',\n",
 "    'usc': 'University of Southern California. Libraries',\n",
 "    'virginia': 'University of Virginia Library',\n",
 "}\n",
 "\n",
 "# Fail fast on a provider without a label instead of exporting a half-renamed table.\n",
 "unlabelled = sorted(set(stats) - set(PROVIDER_NAMES))\n",
 "if unlabelled:\n",
 "    raise ValueError('no display name for provider(s): %s' % ', '.join(unlabelled))\n",
 "\n",
 "# Columns are renamed by key, not by position, so a label can never land on the wrong\n",
 "# provider if the dict/column order changes. sort_index(axis=1) pins the alphabetical\n",
 "# key order that the stored outputs of this notebook were produced with.\n",
 "df = pd.DataFrame(stats).sort_index(axis=1).rename(columns=PROVIDER_NAMES)\n",
 "\n",
 "# One row per provider, one column per counter.\n",
 "df.T.to_csv(\"nltk.stats.csv\")"
 ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexvariablevalue
15 funiq 1785
37 fwc 4626
59 haps 1307
81 lowerhaps 1202
103 uniq 1867
125 wc 6192
0 ARTstor funiq 60168
22 ARTstor fwc 5025070
44 ARTstor haps 29757
66 ARTstor lowerhaps 24103
88 ARTstor uniq 60293
110 ARTstor wc 6972534
1 Biodiversity Heritage Library funiq 94248
23 Biodiversity Heritage Library fwc 5658739
45 Biodiversity Heritage Library haps 44804
67 Biodiversity Heritage Library lowerhaps 38471
89 Biodiversity Heritage Library uniq 94372
111 Biodiversity Heritage Library wc 6381376
16 David Rumsey funiq 47343
38 David Rumsey fwc 8825833
60 David Rumsey haps 14682
82 David Rumsey lowerhaps 12520
104 David Rumsey uniq 47463
126 David Rumsey wc 11667865
2 Digital Commonwealth funiq 204577
24 Digital Commonwealth fwc 11348522
46 Digital Commonwealth haps 159095
68 Digital Commonwealth lowerhaps 154009
90 Digital Commonwealth uniq 204703
112 Digital Commonwealth wc 14022356
............
19 The Portal to Texas History funiq 855594
41 The Portal to Texas History fwc 72699549
63 The Portal to Texas History haps 245710
85 The Portal to Texas History lowerhaps 237998
107 The Portal to Texas History uniq 855720
129 The Portal to Texas History wc 88574895
5 United States Government Printing Office (GPO) funiq 437646
27 United States Government Printing Office (GPO) fwc 21860075
49 United States Government Printing Office (GPO) haps 351637
71 United States Government Printing Office (GPO) lowerhaps 343619
93 United States Government Printing Office (GPO) uniq 437770
115 United States Government Printing Office (GPO) wc 26316103
8 University of Illinois at Urbana-Champaign funiq 49018
30 University of Illinois at Urbana-Champaign fwc 1829267
52 University of Illinois at Urbana-Champaign haps 29755
74 University of Illinois at Urbana-Champaign lowerhaps 23849
96 University of Illinois at Urbana-Champaign uniq 49143
118 University of Illinois at Urbana-Champaign wc 2385501
20 University of Southern California. Libraries funiq 259523
42 University of Southern California. Libraries fwc 41542851
64 University of Southern California. Libraries haps 108852
86 University of Southern California. Libraries lowerhaps 94218
108 University of Southern California. Libraries uniq 259650
130 University of Southern California. Libraries wc 49296854
21 University of Virginia Library funiq 41374
43 University of Virginia Library fwc 1790985
65 University of Virginia Library haps 32902
87 University of Virginia Library lowerhaps 31738
109 University of Virginia Library uniq 41493
131 University of Virginia Library wc 2248517
\n", "

132 rows × 3 columns

\n", "
" ], "text/plain": [ " index variable value\n", "15 funiq 1785\n", "37 fwc 4626\n", "59 haps 1307\n", "81 lowerhaps 1202\n", "103 uniq 1867\n", "125 wc 6192\n", "0 ARTstor funiq 60168\n", "22 ARTstor fwc 5025070\n", "44 ARTstor haps 29757\n", "66 ARTstor lowerhaps 24103\n", "88 ARTstor uniq 60293\n", "110 ARTstor wc 6972534\n", "1 Biodiversity Heritage Library funiq 94248\n", "23 Biodiversity Heritage Library fwc 5658739\n", "45 Biodiversity Heritage Library haps 44804\n", "67 Biodiversity Heritage Library lowerhaps 38471\n", "89 Biodiversity Heritage Library uniq 94372\n", "111 Biodiversity Heritage Library wc 6381376\n", "16 David Rumsey funiq 47343\n", "38 David Rumsey fwc 8825833\n", "60 David Rumsey haps 14682\n", "82 David Rumsey lowerhaps 12520\n", "104 David Rumsey uniq 47463\n", "126 David Rumsey wc 11667865\n", "2 Digital Commonwealth funiq 204577\n", "24 Digital Commonwealth fwc 11348522\n", "46 Digital Commonwealth haps 159095\n", "68 Digital Commonwealth lowerhaps 154009\n", "90 Digital Commonwealth uniq 204703\n", "112 Digital Commonwealth wc 14022356\n", ".. ... ... 
...\n", "19 The Portal to Texas History funiq 855594\n", "41 The Portal to Texas History fwc 72699549\n", "63 The Portal to Texas History haps 245710\n", "85 The Portal to Texas History lowerhaps 237998\n", "107 The Portal to Texas History uniq 855720\n", "129 The Portal to Texas History wc 88574895\n", "5 United States Government Printing Office (GPO) funiq 437646\n", "27 United States Government Printing Office (GPO) fwc 21860075\n", "49 United States Government Printing Office (GPO) haps 351637\n", "71 United States Government Printing Office (GPO) lowerhaps 343619\n", "93 United States Government Printing Office (GPO) uniq 437770\n", "115 United States Government Printing Office (GPO) wc 26316103\n", "8 University of Illinois at Urbana-Champaign funiq 49018\n", "30 University of Illinois at Urbana-Champaign fwc 1829267\n", "52 University of Illinois at Urbana-Champaign haps 29755\n", "74 University of Illinois at Urbana-Champaign lowerhaps 23849\n", "96 University of Illinois at Urbana-Champaign uniq 49143\n", "118 University of Illinois at Urbana-Champaign wc 2385501\n", "20 University of Southern California. Libraries funiq 259523\n", "42 University of Southern California. Libraries fwc 41542851\n", "64 University of Southern California. Libraries haps 108852\n", "86 University of Southern California. Libraries lowerhaps 94218\n", "108 University of Southern California. Libraries uniq 259650\n", "130 University of Southern California. 
Libraries wc 49296854\n", "21 University of Virginia Library funiq 41374\n", "43 University of Virginia Library fwc 1790985\n", "65 University of Virginia Library haps 32902\n", "87 University of Virginia Library lowerhaps 31738\n", "109 University of Virginia Library uniq 41493\n", "131 University of Virginia Library wc 2248517\n", "\n", "[132 rows x 3 columns]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
 "from IPython.display import display\n",
 "\n",
 "# Long form of the stats table: one row per (provider, counter) pair.\n",
 "# DataFrame.sort() no longer exists in current pandas; sort_values() replaces it.\n",
 "# mergesort is stable, so the rows of one provider keep their funiq..wc order.\n",
 "stats_long = pd.melt(df.T.reset_index(), id_vars=['index']).sort_values('index', kind='mergesort')\n",
 "display(stats_long)"
 ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": true }, "outputs": [], "source": [
 "# Export the melted table built in the previous cell instead of recomputing it.\n",
 "stats_long.to_csv('nltk.stats.melted.tmp.csv')"
 ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[('war', 9316),\n", " ('history', 7940),\n", " ('new', 7432),\n", " ('county', 7030),\n", " ('georgia', 6108),\n", " ('university', 6024),\n", " ('american', 5883),\n", " ('library', 5799),\n", " ('john', 5015),\n", " ('world', 4293),\n", " ('women', 4092),\n", " ('york', 3972),\n", " ('civil', 3853),\n", " ('states', 3604),\n", " ('united', 3590),\n", " ('carolina', 3570),\n", " ('de', 3223),\n", " ('william', 3148),\n", " ('south', 3072),\n", " ('art', 2864),\n", " ('school', 2734),\n", " ('utah', 2726),\n", " ('city', 2682),\n", " ('north', 2581),\n", " ('state', 2563),\n", " ('james', 2517),\n", " ('public', 2490),\n", " ('family', 2335),\n", " ('c', 2308),\n", " ('boston', 2280),\n", " ('george', 2254),\n", " ('college', 2159),\n", " ('map', 2154),\n", " ('atlanta', 2126),\n", " ('america', 2103),\n", " ('national', 2032),\n", " ('thomas', 1979),\n", " ('virginia', 1950),\n", " ('great', 1883),\n", " ('charles', 1825),\n", " ('st', 1820),\n", " ('texas', 1818),\n", " ('ga', 1804),\n", " ('washington', 1801),\n", " ('california', 1801),\n", " ('african', 1779),\n", " ('robert', 1727),\n", " 
('minnesota', 1722),\n", " ('rights', 1708),\n", " ('genealogy', 1702),\n", " ('book', 1680),\n", " ('digital', 1634),\n", " ('lat', 1617),\n", " ('church', 1616),\n", " ('black', 1614),\n", " ('massachusetts', 1588),\n", " ('henry', 1587),\n", " ('king', 1584),\n", " ('education', 1581),\n", " ('j', 1581),\n", " ('life', 1577),\n", " ('books', 1568),\n", " ('d', 1525),\n", " ('archives', 1522),\n", " ('maps', 1500),\n", " ('music', 1472),\n", " ('century', 1468),\n", " ('west', 1450),\n", " ('la', 1434),\n", " ('b', 1430),\n", " ('e', 1429),\n", " ('park', 1420),\n", " ('david', 1411),\n", " ('railroad', 1370),\n", " ('island', 1354),\n", " ('ohio', 1343),\n", " ('museum', 1335),\n", " ('photographs', 1328),\n", " ('house', 1314),\n", " ('chicago', 1304),\n", " ('children', 1289),\n", " ('h', 1288),\n", " ('social', 1287),\n", " ('street', 1254),\n", " ('science', 1250),\n", " ('san', 1244),\n", " ('society', 1214),\n", " ('illinois', 1206),\n", " ('f', 1205),\n", " ('lake', 1187),\n", " ('works', 1174),\n", " ('w', 1168),\n", " ('law', 1162),\n", " ('revolution', 1161),\n", " ('martin', 1157),\n", " ('indian', 1149),\n", " ('indians', 1139),\n", " ('river', 1139),\n", " ('english', 1132),\n", " ('americans', 1126),\n", " ('libraries', 1100),\n", " ('architecture', 1091),\n", " ('mary', 1089),\n", " ('l', 1088),\n", " ('m', 1084),\n", " ('ny', 1080),\n", " ('historical', 1079),\n", " ('paul', 1065),\n", " ('kentucky', 1053),\n", " ('michigan', 1044),\n", " ('records', 1042),\n", " ('joseph', 1040),\n", " ('fiction', 1040),\n", " ('high', 1031),\n", " ('ii', 1020),\n", " ('r', 1019),\n", " ('center', 1009),\n", " ('37', 1007),\n", " ('government', 999),\n", " ('mass', 989),\n", " ('louis', 988),\n", " ('china', 983),\n", " ('act', 969),\n", " ('native', 966),\n", " ('996162679728116', 957),\n", " ('edward', 955),\n", " ('slavery', 941),\n", " ('vietnam', 930),\n", " ('company', 921),\n", " ('smith', 919),\n", " ('business', 919),\n", " ('lincoln', 916),\n", " 
('literature', 899),\n", " ('death', 899),\n", " ('japanese', 897),\n", " ('woman', 894),\n", " ('language', 882),\n", " ('fire', 878),\n", " ('white', 876),\n", " ('day', 870),\n", " ('buildings', 865),\n", " ('richard', 860),\n", " ('pennsylvania', 853),\n", " ('chinese', 851),\n", " ('early', 845),\n", " ('spanish', 844),\n", " ('luther', 840),\n", " ('kennedy', 840),\n", " ('slave', 831),\n", " ('roosevelt', 826),\n", " ('ma', 816),\n", " ('brown', 812),\n", " ('general', 809),\n", " ('missouri', 809),\n", " ('design', 807),\n", " ('florida', 807),\n", " ('o', 806),\n", " ('frank', 806),\n", " ('collection', 805),\n", " ('first', 805),\n", " ('french', 802),\n", " ('child', 801),\n", " ('n', 797),\n", " ('pa', 796),\n", " ('management', 795),\n", " ('military', 792),\n", " ('old', 790),\n", " ('england', 781),\n", " ('lewis', 779),\n", " ('army', 774),\n", " ('southern', 769),\n", " ('mountain', 769),\n", " ('us', 767),\n", " ('los', 764),\n", " ('2', 762),\n", " ('u', 761),\n", " ('jackson', 759),\n", " ('labor', 755),\n", " ('indiana', 754),\n", " ('health', 752),\n", " ('bible', 752),\n", " ('immigration', 750),\n", " ('colorado', 749),\n", " ('ca', 747),\n", " ('mexico', 746),\n", " ('depression', 742),\n", " ('battle', 742),\n", " ('red', 740),\n", " ('union', 736),\n", " ('harvard', 730),\n", " ('lee', 728),\n", " ('baseball', 725),\n", " ('samuel', 723),\n", " ('man', 722),\n", " ('g', 717),\n", " ('nc', 716),\n", " ('1865', 716),\n", " ('east', 716),\n", " ('hill', 713),\n", " ('air', 710),\n", " ('p', 705),\n", " ('home', 705),\n", " ('image', 704),\n", " ('arizona', 702),\n", " ('research', 701),\n", " ('1', 701),\n", " ('alabama', 699),\n", " ('maine', 699),\n", " ('journal', 698),\n", " ('wisconsin', 695),\n", " ('technology', 693),\n", " ('people', 693),\n", " ('franklin', 692),\n", " ('tennessee', 689),\n", " ('men', 680),\n", " ('ancient', 680),\n", " ('fort', 679),\n", " ('theater', 672),\n", " ('little', 668),\n", " ('schools', 667),\n", " 
('food', 665),\n", " ('france', 657),\n", " ('mn', 653),\n", " ('culture', 652),\n", " ('pictorial', 647),\n", " ('connecticut', 642),\n", " ('politics', 641),\n", " ('newspapers', 634),\n", " ('collections', 633),\n", " ('work', 626),\n", " ('1918', 625),\n", " ('francisco', 625),\n", " ('smithsonian', 624),\n", " ('oregon', 622),\n", " ('charleston', 621),\n", " ('movement', 618),\n", " ('medical', 617),\n", " ('german', 614),\n", " ('travel', 613),\n", " ('special', 613),\n", " ('johnson', 608),\n", " ('germany', 607),\n", " ('portrait', 605),\n", " ('angeles', 602),\n", " ('1920', 602),\n", " ('elizabeth', 600),\n", " ('peter', 597),\n", " ('information', 596),\n", " ('gold', 595),\n", " ('rock', 595),\n", " ('1800', 594),\n", " ('scott', 591),\n", " ('1861', 588),\n", " ('1914', 586),\n", " ('computer', 585),\n", " ('1945', 585),\n", " ('mining', 585),\n", " ('harry', 585),\n", " ('benjamin', 584),\n", " ('suffrage', 581),\n", " ('kansas', 575),\n", " ('camp', 575),\n", " ('india', 574),\n", " ('magazine', 573),\n", " ('clark', 572),\n", " ('v', 570),\n", " ('power', 568),\n", " ('alexander', 566),\n", " ('human', 566),\n", " ('service', 565),\n", " ('iowa', 563),\n", " ('michael', 563),\n", " ('photography', 562),\n", " ('letter', 562),\n", " ('religion', 561),\n", " ('medicine', 560),\n", " ('building', 557),\n", " ('portraits', 555),\n", " ('department', 554),\n", " ('project', 554),\n", " ('van', 549),\n", " ('water', 547),\n", " ('1939', 546),\n", " ('hall', 546),\n", " ('trade', 546),\n", " ('mississippi', 544),\n", " ('valley', 541),\n", " ('industry', 541),\n", " ('co', 540),\n", " ('philadelphia', 539),\n", " ('london', 538),\n", " ('young', 537),\n", " ('arts', 537),\n", " ('etc', 537),\n", " ('poetry', 535),\n", " ('institute', 535),\n", " ('horse', 533),\n", " ('arthur', 532),\n", " ('japan', 531),\n", " ('frederick', 531),\n", " ('columbia', 530),\n", " ('20th', 529),\n", " ('political', 526),\n", " ('race', 525),\n", " ('minneapolis', 524),\n", " 
('sports', 524),\n", " ('maryland', 523),\n", " ('jersey', 523),\n", " ('space', 522),\n", " ('report', 521),\n", " ('dr', 521),\n", " ('mark', 520),\n", " ('students', 520),\n", " ('deal', 516),\n", " ('administration', 514),\n", " ('soldiers', 513),\n", " ('jr', 512),\n", " ('yale', 512),\n", " ('road', 511),\n", " ('international', 511),\n", " ('jones', 510),\n", " ('independence', 509),\n", " ('herald', 508),\n", " ('1900', 507),\n", " ('farm', 507),\n", " ('colonial', 506),\n", " ('nevada', 504),\n", " ('irish', 501),\n", " ('jane', 501),\n", " ('modern', 500),\n", " ('newspaper', 500),\n", " ('dog', 500),\n", " ('psychology', 498),\n", " ('pacific', 498),\n", " ('jefferson', 497),\n", " ('salt', 496),\n", " ('williams', 493),\n", " ('va', 493),\n", " ('theory', 490),\n", " ('letters', 490),\n", " ('system', 490),\n", " ('fair', 490),\n", " ('saint', 489),\n", " ('group', 489),\n", " ('green', 487),\n", " ('land', 483),\n", " ('russian', 482),\n", " ('free', 481),\n", " ('cherokee', 481),\n", " ('abraham', 481),\n", " ('time', 479),\n", " ('field', 478),\n", " ('adams', 475),\n", " ('guide', 475),\n", " ('africa', 474),\n", " ('british', 473),\n", " ('sc', 470),\n", " ('radio', 470),\n", " ('bill', 470),\n", " ('pictures', 470),\n", " ('beach', 469),\n", " ('central', 469),\n", " ('domain', 468),\n", " ('1940', 467),\n", " ('andrew', 464),\n", " ('walter', 462),\n", " ('louisiana', 460),\n", " ('theatre', 459),\n", " ('one', 458),\n", " ('francis', 453),\n", " ('girl', 452),\n", " ('hospital', 452),\n", " ('springs', 452),\n", " ('wilson', 449),\n", " ('police', 449),\n", " ('customs', 448),\n", " ('ford', 448),\n", " ('rumsey', 448),\n", " ('el', 447),\n", " ('stephen', 447),\n", " ('christian', 446),\n", " ('brooklyn', 445),\n", " ('albert', 443),\n", " ('jack', 443),\n", " ('shakespeare', 443),\n", " ('spain', 440),\n", " ('workers', 439),\n", " ('heritage', 438),\n", " ('wright', 437),\n", " ('hotel', 436),\n", " ('k', 433),\n", " ('immigrants', 433),\n", 
" ('engineering', 432),\n", " ('wwii', 432),\n", " ('media', 432),\n", " ('daniel', 431),\n", " ('19th', 429),\n", " ('gallery', 428),\n", " ('study', 428),\n", " ('two', 427),\n", " ('photos', 427),\n", " ('laws', 426),\n", " ('march', 425),\n", " ('periodicals', 425),\n", " ('y', 421),\n", " ('postcard', 420),\n", " ('philosophy', 419),\n", " ('montana', 419),\n", " ('squadron', 419),\n", " ('dance', 417),\n", " ('industrial', 416),\n", " ('western', 416),\n", " ('salem', 416),\n", " ('1950', 415),\n", " ('jewish', 415),\n", " ('hitler', 415),\n", " ('images', 414),\n", " ('president', 412),\n", " ('tom', 411),\n", " ('parks', 411),\n", " ('economics', 409),\n", " ('mexican', 409),\n", " ('gay', 409),\n", " ('al', 408),\n", " ('football', 408),\n", " ('creek', 408),\n", " ('gordon', 406),\n", " ('edgar', 406),\n", " ('howard', 405),\n", " ('census', 405),\n", " ('allen', 405),\n", " ('club', 404),\n", " ('detroit', 403),\n", " ('domestic', 401),\n", " ('construction', 400),\n", " ('association', 399),\n", " ('reading', 399),\n", " ('biography', 398),\n", " ('congress', 398),\n", " ('age', 397),\n", " ('lawrence', 395),\n", " ('community', 394),\n", " ('forest', 393),\n", " ('von', 392),\n", " ('sex', 392),\n", " ('love', 391),\n", " ('bridge', 390),\n", " ('description', 389),\n", " ('arkansas', 389),\n", " ('game', 389),\n", " ('declaration', 388),\n", " ('des', 388),\n", " ('development', 388),\n", " ('historic', 387),\n", " ('security', 387),\n", " ('town', 387),\n", " ('soviet', 386),\n", " ('may', 386),\n", " ('margaret', 386),\n", " ('federal', 385),\n", " ('11', 385),\n", " ('davis', 385),\n", " ('greek', 384),\n", " ('long', 383),\n", " ('tribune', 383),\n", " ('big', 383),\n", " ('natural', 380),\n", " ('miller', 380),\n", " ('control', 379),\n", " ('news', 379),\n", " ('dakota', 378),\n", " ('1930', 373),\n", " ('stone', 373),\n", " ('hampshire', 372),\n", " ('europe', 370),\n", " ('newton', 370),\n", " ('anne', 370),\n", " ('globe', 370),\n", " 
('paris', 369),\n", " ('animal', 368),\n", " ('navy', 367),\n", " ('fashion', 366),\n", " ('britain', 366),\n", " ('santa', 365),\n", " ('slaves', 364),\n", " ('diary', 363),\n", " ('alice', 363),\n", " ('film', 362),\n", " ('empire', 362),\n", " ('charlotte', 362),\n", " ('avenue', 362),\n", " ('dept', 362),\n", " ('policy', 362),\n", " ('relations', 360),\n", " ('constitution', 360),\n", " ('catholic', 360),\n", " ('negro', 359),\n", " ('columbus', 359),\n", " ('cat', 359),\n", " ('jean', 358),\n", " ('ireland', 358),\n", " ('il', 358),\n", " ('corps', 357),\n", " ('garden', 357),\n", " ('post', 357),\n", " ('photo', 357),\n", " ('oklahoma', 356),\n", " ('russia', 356),\n", " ('russell', 356),\n", " ('car', 355),\n", " ('physical', 354),\n", " ('warren', 354),\n", " ('mill', 353),\n", " ('analysis', 352),\n", " ('party', 352),\n", " ('medieval', 351),\n", " ('canada', 351),\n", " ('sea', 350),\n", " ('nebraska', 350),\n", " ('freedom', 349),\n", " ('idaho', 349),\n", " ('learning', 348),\n", " ('painting', 348),\n", " ('oil', 348),\n", " ('agriculture', 348),\n", " ('duluth', 347),\n", " ('augusta', 347),\n", " ('1960', 346),\n", " ('poster', 346),\n", " ('ship', 345),\n", " ('cotton', 344),\n", " ('papers', 342),\n", " ('conservation', 342),\n", " ('aerial', 342),\n", " ('video', 341),\n", " ('cold', 340),\n", " ('times', 340),\n", " ('directory', 340),\n", " ('archive', 340),\n", " ('railroads', 339),\n", " ('poe', 339),\n", " ('ann', 338),\n", " ('bay', 338),\n", " ('review', 337),\n", " ('houses', 336),\n", " ('ky', 335),\n", " ('transportation', 335),\n", " ('grand', 334),\n", " ('strike', 334),\n", " ('vermont', 333),\n", " ('jim', 333),\n", " ('games', 330),\n", " ('theodore', 330),\n", " ('insignia', 330),\n", " ('twain', 330),\n", " ('photograph', 330),\n", " ('roman', 329),\n", " ('advertising', 328),\n", " ('train', 327),\n", " ('middle', 326),\n", " ('fishing', 324),\n", " ('trail', 324),\n", " ('dead', 324),\n", " ('revolutionary', 322),\n", " 
('religious', 321),\n", " ('point', 321),\n", " ('story', 320),\n", " ('snow', 320),\n", " ('wood', 319),\n", " ('carter', 319),\n", " ('rush', 319),\n", " ('le', 319),\n", " ('falls', 319),\n", " ('italy', 318),\n", " ('prohibition', 318),\n", " ('fish', 317),\n", " ('orleans', 316),\n", " ('harbor', 315),\n", " ('bird', 314),\n", " ('carl', 313),\n", " ('uss', 313),\n", " ('artstor', 313),\n", " ('walker', 313),\n", " ('rome', 312),\n", " ('propaganda', 311),\n", " ('resources', 311),\n", " ('egypt', 311),\n", " ('nj', 310),\n", " ('jews', 309),\n", " ('office', 309),\n", " ('anti', 308),\n", " ('teaching', 308),\n", " ('clothing', 307),\n", " ('coal', 306),\n", " ('reform', 306),\n", " ('canal', 306),\n", " ('berlin', 306),\n", " ('greece', 306),\n", " ('emily', 306),\n", " ('hawaii', 305),\n", " ('wells', 304),\n", " ('italian', 304),\n", " ('physics', 304),\n", " ('fitzgerald', 302),\n", " ('landscape', 302),\n", " ('boy', 302),\n", " ('division', 302),\n", " ('mormon', 300),\n", " ('atlas', 300),\n", " ('dp', 300),\n", " ('bell', 298),\n", " ('writing', 298),\n", " ('colored', 297),\n", " ('douglas', 297),\n", " ('nmnh', 296),\n", " ('baltimore', 295),\n", " ('holocaust', 295),\n", " ('female', 295),\n", " ('iron', 294),\n", " ('square', 294),\n", " ('foreign', 294),\n", " ('station', 293),\n", " ('renaissance', 293),\n", " ('cross', 293),\n", " ('data', 293),\n", " ('urban', 293),\n", " ('memorial', 293),\n", " ('hemingway', 292),\n", " ('da', 292),\n", " ('helen', 292),\n", " ('1963', 291),\n", " ('girls', 291),\n", " ('taylor', 291),\n", " ('program', 291),\n", " ('press', 290),\n", " ('sources', 290),\n", " ('crime', 289),\n", " ('robinson', 289),\n", " ('nuclear', 288),\n", " ('philip', 288),\n", " ('code', 288),\n", " ('moon', 287),\n", " ('oral', 287),\n", " ('marriage', 287),\n", " ('chemistry', 286),\n", " ('las', 286),\n", " ('board', 285),\n", " ('rhode', 285),\n", " ('x', 285),\n", " ('christmas', 284),\n", " ('marshall', 284),\n", " ('cod', 
284),\n", " ('alaska', 283),\n", " ('mission', 283),\n", " ('ernest', 283),\n", " ('internment', 283),\n", " ('views', 282),\n", " ('use', 282),\n", " ('protest', 282),\n", " ('market', 282),\n", " ('brothers', 281),\n", " ('jazz', 281),\n", " ('murray', 281),\n", " ('anthony', 281),\n", " ('madison', 281),\n", " ('et', 280),\n", " ('potter', 280),\n", " ('internet', 280),\n", " ('dick', 279),\n", " ('studies', 279),\n", " ('personal', 279),\n", " ('dogs', 279),\n", " ('queen', 279),\n", " ('dress', 279),\n", " ('camps', 279),\n", " ('walt', 279),\n", " ('second', 278),\n", " ('1961', 278),\n", " ('mills', 278),\n", " ('court', 278),\n", " ('posters', 278),\n", " ('confederate', 277),\n", " ('blue', 277),\n", " ('republic', 276),\n", " ('machine', 276),\n", " ('austin', 276),\n", " ('1929', 276),\n", " ('obituaries', 275),\n", " ('del', 274),\n", " ('grant', 274),\n", " ('glass', 274),\n", " ('emblem', 274),\n", " ('savannah', 273),\n", " ('marketing', 273),\n", " ('manual', 272),\n", " ('brazil', 272),\n", " ('picture', 271),\n", " ('years', 271),\n", " ('programs', 271),\n", " ('baptist', 271),\n", " ('richmond', 271),\n", " ('du', 270),\n", " ('basketball', 269),\n", " ('nelson', 269),\n", " ('pearl', 269),\n", " ('mount', 269),\n", " ('anderson', 269),\n", " ('student', 268),\n", " ('antonio', 268),\n", " ('nursing', 268),\n", " ('automobile', 267),\n", " ('ct', 267),\n", " ('star', 267),\n", " ('interior', 266),\n", " ('electric', 266),\n", " ('christopher', 265),\n", " ('cache', 265),\n", " ('murder', 264),\n", " ('jesus', 264),\n", " ('stories', 263),\n", " ('tree', 263),\n", " ('1920s', 263),\n", " ('der', 262),\n", " ('model', 262),\n", " ('august', 262),\n", " ('earth', 262),\n", " ('com', 262),\n", " ('text', 261),\n", " ('korean', 261),\n", " ('economic', 260),\n", " ('dallas', 260),\n", " ('usa', 260),\n", " ('alfred', 260),\n", " ('ellis', 259),\n", " ('communication', 259),\n", " ('temple', 258),\n", " ('league', 258),\n", " ('joe', 258),\n", " 
('isaac', 258),\n", " ('eagle', 257),\n", " ('champaign', 257),\n", " ('year', 256),\n", " ('sarah', 256),\n", " ('sound', 256),\n", " ('housing', 256),\n", " ('witch', 256),\n", " ('ww2', 255),\n", " ('harriet', 255),\n", " ('brain', 255),\n", " ('training', 255),\n", " ('civilian', 255),\n", " ('cleveland', 255),\n", " ('force', 255),\n", " ('statistics', 255),\n", " ('introduction', 255),\n", " ('railway', 254),\n", " ('nature', 254),\n", " ('animals', 254),\n", " ('delaware', 254),\n", " ('massacre', 253),\n", " ('buffalo', 253),\n", " ('television', 251),\n", " ('peace', 251),\n", " ('programming', 251),\n", " ('line', 251),\n", " ('birds', 251),\n", " ('jacob', 250),\n", " ('card', 250),\n", " ('cities', 250),\n", " ('3', 250),\n", " ('dickens', 249),\n", " ('mathematics', 249),\n", " ('1775', 249),\n", " ('plant', 248),\n", " ('search', 248),\n", " ('ray', 247),\n", " ('golden', 247),\n", " ('dream', 246),\n", " ('boys', 246),\n", " ('assassination', 246),\n", " ('insurance', 245),\n", " ('rice', 245),\n", " ('harlem', 244),\n", " ('ethics', 244),\n", " ('hamilton', 244),\n", " ('class', 244),\n", " ('morgan', 244),\n", " ('cuba', 243),\n", " ('care', 243),\n", " ('ice', 243),\n", " ('1917', 243),\n", " ('1860', 243),\n", " ('bowl', 243),\n", " ('mine', 243),\n", " ('mines', 242),\n", " ('families', 242),\n", " ('monroe', 242),\n", " ('nh', 242),\n", " ('speech', 242),\n", " ('duke', 242),\n", " ('herbert', 242),\n", " ('rose', 242),\n", " ('ut', 241),\n", " ('geology', 241),\n", " ('horses', 241),\n", " ('vs', 241),\n", " ('douglass', 240),\n", " ('1910', 240),\n", " ('prince', 240),\n", " ('canyon', 240),\n", " ('catalog', 239),\n", " ('liberty', 239),\n", " ('good', 239),\n", " ('god', 239),\n", " ('reconstruction', 238),\n", " ('criticism', 237),\n", " ('naval', 237),\n", " ('july', 237),\n", " ('test', 237),\n", " ('winter', 237),\n", " ('night', 237),\n", " ('cats', 237),\n", " ('movie', 236),\n", " ('womens', 236),\n", " ('dictionary', 236),\n", " 
('correspondence', 236),\n", " ('wpa', 236),\n", " ('ruth', 235),\n", " ('morris', 235),\n", " ('plan', 235),\n", " ('primary', 234),\n", " ('self', 234),\n", " ('country', 234),\n", " ('albany', 234),\n", " ('cooper', 234),\n", " ('mrs', 234),\n", " ('lowell', 233),\n", " ('color', 233),\n", " ('manuscripts', 233),\n", " ('rico', 233),\n", " ('thompson', 233),\n", " ('finance', 233),\n", " ('1975', 233),\n", " ('1970', 233),\n", " ('http', 233),\n", " ('tea', 232),\n", " ('manhattan', 232),\n", " ('district', 232),\n", " ('sam', 232),\n", " ('disney', 231),\n", " ('pittsburgh', 231),\n", " ('plants', 231),\n", " ('puerto', 231),\n", " ('elementary', 231),\n", " ('korea', 231),\n", " ('cars', 230),\n", " ('era', 230),\n", " ('birth', 230),\n", " ('churches', 229),\n", " ('urbana', 228),\n", " ('drug', 228),\n", " ('lord', 227),\n", " ('patrick', 227),\n", " ('lloyd', 226),\n", " ('seattle', 226),\n", " ('planning', 226),\n", " ('houston', 226),\n", " ('heart', 226),\n", " ('paper', 225),\n", " ('energy', 225),\n", " ('plantation', 225),\n", " ('change', 224),\n", " ('systems', 224),\n", " ('susan', 224),\n", " ('audio', 223),\n", " ('fred', 223),\n", " ('eugene', 222),\n", " ('bob', 222),\n", " ('academy', 222),\n", " ('light', 222),\n", " ('colleges', 222),\n", " ('annual', 221),\n", " ('record', 220),\n", " ('committee', 220),\n", " ('haven', 220),\n", " ('moore', 219),\n", " ('sanborn', 219),\n", " ('disease', 219),\n", " ('bureau', 219),\n", " ('ships', 219),\n", " ('eacute', 219),\n", " ('bomb', 219),\n", " ('sugar', 218),\n", " ('racing', 218),\n", " ('motion', 218),\n", " ('sculpture', 218),\n", " ('botany', 218),\n", " ('cooking', 217),\n", " ('manuals', 217),\n", " ('flight', 217),\n", " ('cemetery', 216),\n", " ('commission', 216),\n", " ('making', 216),\n", " ('tn', 215),\n", " ('wars', 215),\n", " ('period', 215),\n", " ('kill', 215),\n", " ('ocean', 215),\n", " ('9', 215),\n", " ('chester', 215),\n", " ('cultural', 215),\n", " ('campbell', 215),\n", " 
('karl', 214),\n", " ('three', 214),\n", " ('1968', 214),\n", " ('marie', 214),\n", " ('olympics', 214),\n", " ('dickinson', 213),\n", " ('methodist', 213),\n", " ('wall', 213),\n", " ('soldier', 213),\n", " ('money', 213),\n", " ('cook', 213),\n", " ('org', 213),\n", " ('montgomery', 213),\n", " ('store', 212),\n", " ('lost', 212),\n", " ('trials', 212),\n", " ('latin', 212),\n", " ('amendment', 211),\n", " ('holmes', 211),\n", " ('drawings', 211),\n", " ('costume', 211),\n", " ('bank', 211),\n", " ('funeral', 210),\n", " ('anatomy', 210),\n", " ('survey', 210),\n", " ('encyclopedia', 210),\n", " ('1890', 209),\n", " ('segregation', 209),\n", " ('maria', 209),\n", " ('wayne', 209),\n", " ('baker', 209),\n", " ('beverly', 209),\n", " ('environmental', 208),\n", " ('ralph', 208),\n", " ('denver', 208),\n", " ('mountains', 208),\n", " ('postcards', 208),\n", " ('nude', 208),\n", " ('atlantic', 208),\n", " ('grammar', 208),\n", " ('dust', 207),\n", " ('mitchell', 207),\n", " ('1850', 207),\n", " ('1870', 206),\n", " ('mo', 206),\n", " ('gettysburg', 206),\n", " ('show', 206),\n", " ('di', 206),\n", " ('mental', 206),\n", " ('ben', 206),\n", " ('poland', 205),\n", " ('flowers', 205),\n", " ('atomic', 205),\n", " ('einstein', 205),\n", " ('harold', 205),\n", " ('web', 205),\n", " ('therapy', 204),\n", " ('register', 204),\n", " ('september', 204),\n", " ('1964', 204),\n", " ('truman', 204),\n", " ('greenville', 204),\n", " ('series', 204),\n", " ('graham', 204),\n", " ('1980', 204),\n", " ('crisis', 204),\n", " ('property', 204),\n", " ('en', 203),\n", " ('tx', 203),\n", " ('juan', 203),\n", " ('gun', 203),\n", " ('evolution', 203),\n", " ('www', 202),\n", " ('northern', 202),\n", " ('1912', 202),\n", " ('4', 201),\n", " ('newport', 201),\n", " ('joyce', 201),\n", " ('printing', 201),\n", " ('summer', 201),\n", " ('prison', 201),\n", " ('tobacco', 201),\n", " ('7', 200),\n", " ('simon', 200),\n", " ('reno', 199),\n", " ('clinton', 199),\n", " ('ward', 199),\n", " 
('services', 199),\n", " ...]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#>>> fdist = nltk.FreqDist(ch.lower() for ch in raw if ch.isalpha())\n", "fd = nltk.FreqDist(token.lower() for token in searfilt)\n", "fd.most_common()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[('virginia', 166709),\n", " ('university', 107608),\n", " ('library', 64289),\n", " ('charlottesville', 54329),\n", " ('lib', 40564),\n", " ('image', 35858),\n", " ('va', 35396),\n", " ('holsinger', 31644),\n", " ('studio', 31459),\n", " ('collection', 30225),\n", " ('states', 29421),\n", " ('uva', 29228),\n", " ('collections', 26683),\n", " ('special', 23382),\n", " ('negatives', 22543),\n", " ('visual', 22459),\n", " ('history', 22162),\n", " ('ca', 21622),\n", " ('material', 21482),\n", " ('use', 17953),\n", " ('united', 17390),\n", " ('information', 15949),\n", " ('go', 15910),\n", " ('please', 15897),\n", " ('search', 15895),\n", " ('edu', 15894),\n", " ('terms', 15894),\n", " ('http', 15894),\n", " ('regions', 12101),\n", " ('w', 12006),\n", " ('name', 11945),\n", " ('davis', 11606),\n", " ('photographs', 11564),\n", " ('jackson', 11421),\n", " ('1825', 11044),\n", " ('present', 10993),\n", " ('online', 10977),\n", " ('1890', 10538),\n", " ('1938', 10459),\n", " ('photographic', 10386),\n", " ('index', 10378),\n", " ('plus', 10377),\n", " ('volume', 10376),\n", " ('1930', 9763),\n", " ('1866', 9685),\n", " ('rufus', 9632),\n", " ('glass', 9520),\n", " ('african', 9456),\n", " ('portraits', 9377),\n", " ('visitors', 8915),\n", " ('may', 8908),\n", " ('must', 8871),\n", " ('without', 8871),\n", " ('reproduced', 8864),\n", " ('rector', 8864),\n", " ('permission', 8863),\n", " ('additional', 8863),\n", " ('credited', 8863),\n", " ('plate', 8485),\n", " ('white', 7954),\n", " ('good', 7663),\n", " ('black', 7652),\n", " ('condition', 7646),\n", " ('works', 
6794),\n", " ('group', 6386),\n", " ('people', 6351),\n", " ('built', 6346),\n", " ('single', 6266),\n", " ('american', 6206),\n", " ('1882', 5640),\n", " ('1947', 5617),\n", " ('5x7', 5607),\n", " ('photography', 5528),\n", " ('restrictions', 5518),\n", " ('accessing', 5518),\n", " ('county', 5107),\n", " ('music', 5090),\n", " ('south', 4872),\n", " ('function', 4839),\n", " ('8x10', 4259),\n", " ('u', 4232),\n", " ('1915', 3833),\n", " ('text', 3699),\n", " ('ethnic', 3422),\n", " ('school', 3413),\n", " ('americans', 3335),\n", " ('record', 3332),\n", " ('digital', 3319),\n", " ('dpla', 3317),\n", " ('print', 2968),\n", " ('piano', 2898),\n", " ('towns', 2790),\n", " ('cities', 2790),\n", " ('schools', 2741),\n", " ('c', 2713),\n", " ('notated', 2549),\n", " ('contact', 2338),\n", " ('n', 2143),\n", " ('children', 2103),\n", " ('unknown', 2090),\n", " ('new', 2067),\n", " ('conditions', 2014),\n", " ('english', 1922),\n", " ('buildings', 1877),\n", " ('1917', 1818),\n", " ('date', 1794),\n", " ('03', 1768),\n", " ('york', 1738),\n", " ('untitled', 1689),\n", " ('07', 1628),\n", " ('09', 1616),\n", " ('11', 1609),\n", " ('prints', 1600),\n", " ('04', 1572),\n", " ('songs', 1549),\n", " ('j', 1526),\n", " ('12', 1508),\n", " ('05', 1497),\n", " ('1916', 1486),\n", " ('01', 1475),\n", " ('1918', 1474),\n", " ('10', 1454),\n", " ('08', 1441),\n", " ('1914', 1438),\n", " ('mrs', 1426),\n", " ('06', 1386),\n", " ('popular', 1335),\n", " ('architecture', 1329),\n", " ('miss', 1269),\n", " ('elements', 1259),\n", " ('h', 1238),\n", " ('02', 1193),\n", " ('co', 1169),\n", " ('e', 1161),\n", " ('st', 1160),\n", " ('students', 1129),\n", " ('richard', 1110),\n", " ('spaces', 1107),\n", " ('1912', 1077),\n", " ('gender', 1040),\n", " ('education', 1036),\n", " ('emulsion', 1032),\n", " ('john', 1009),\n", " ('content', 1006),\n", " ('linguistic', 1004),\n", " ('rotunda', 965),\n", " ('anderson', 954),\n", " ('1940', 942),\n", " ('f', 929),\n", " ('ralph', 910),\n", " 
('training', 906),\n", " ('sciences', 890),\n", " ('social', 880),\n", " ('film', 877),\n", " ('unidentified', 864),\n", " ('b', 860),\n", " ('boston', 858),\n", " ('g', 853),\n", " ('carolina', 845),\n", " ('d', 837),\n", " ('1913', 824),\n", " ('m', 823),\n", " ('l', 817),\n", " ('trees', 816),\n", " ('overall', 800),\n", " ('william', 789),\n", " ('hall', 788),\n", " ('institutional', 778),\n", " ('africa', 768),\n", " ('babies', 764),\n", " ('philadelphia', 743),\n", " ('subject', 733),\n", " ('1919', 732),\n", " ('r', 728),\n", " ('open', 709),\n", " ('charles', 701),\n", " ('football', 697),\n", " ('college', 681),\n", " ('site', 672),\n", " ('men', 671),\n", " ('portrait', 670),\n", " ('army', 648),\n", " ('george', 642),\n", " ('houses', 640),\n", " ('colleges', 638),\n", " ('sports', 635),\n", " ('damage', 634),\n", " ('faculty', 632),\n", " ('industrial', 618),\n", " ('hand', 616),\n", " ('tex', 614),\n", " ('de', 612),\n", " ('war', 610),\n", " ('world', 608),\n", " ('house', 608),\n", " ('institute', 566),\n", " ('domestic', 561),\n", " ('18', 543),\n", " ('alderman', 538),\n", " ('players', 533),\n", " ('building', 531),\n", " ('soldiers', 525),\n", " ('landscape', 524),\n", " ('age', 519),\n", " ('uniforms', 519),\n", " ('shrubs', 516),\n", " ('mr', 506),\n", " ('rooms', 506),\n", " ('fraternity', 503),\n", " ('right', 499),\n", " ('furniture', 488),\n", " ('form', 485),\n", " ('land', 479),\n", " ('silver', 477),\n", " ('16', 476),\n", " ('1920', 476),\n", " ('photo', 474),\n", " ('gelatin', 470),\n", " ('interior', 467),\n", " ('thomas', 466),\n", " ('21', 465),\n", " ('left', 462),\n", " ('north', 461),\n", " ('washington', 453),\n", " ('colored', 452),\n", " ('instrumental', 452),\n", " ('hats', 451),\n", " ('p', 445),\n", " ('lawn', 441),\n", " ('james', 437),\n", " ('type', 433),\n", " ('types', 430),\n", " ('east', 429),\n", " ('24', 420),\n", " ('henry', 417),\n", " ('la', 417),\n", " ('dr', 410),\n", " ('ga', 407),\n", " ('women', 401),\n", " 
('26', 401),\n", " ('14', 397),\n", " ('engravings', 394),\n", " ('middle', 394),\n", " ('17', 390),\n", " ('along', 389),\n", " ('continents', 388),\n", " ('20', 386),\n", " ('high', 382),\n", " ('architectural', 378),\n", " ('15', 376),\n", " ('13', 376),\n", " ('summer', 376),\n", " ('1921', 374),\n", " ('texas', 373),\n", " ('elizabeth', 372),\n", " ('state', 371),\n", " ('27', 370),\n", " ('old', 370),\n", " ('vehicles', 367),\n", " ('events', 367),\n", " ('bottom', 366),\n", " ('west', 366),\n", " ('city', 365),\n", " ('cartographic', 365),\n", " ('y', 363),\n", " ('albemarle', 361),\n", " ('25', 356),\n", " ('22', 356),\n", " ('19', 355),\n", " ('family', 355),\n", " ('materials', 353),\n", " ('automobiles', 352),\n", " ('o', 351),\n", " ('context', 350),\n", " ('animals', 349),\n", " ('28', 347),\n", " ('1924', 345),\n", " ('corner', 344),\n", " ('side', 343),\n", " ('smith', 341),\n", " ('29', 340),\n", " ('robert', 340),\n", " ('30', 337),\n", " ('baltimore', 337),\n", " ('two', 334),\n", " ('11x14', 332),\n", " ('costume', 331),\n", " ('agricultural', 330),\n", " ('ala', 326),\n", " ('home', 325),\n", " ('horses', 324),\n", " ('landforms', 322),\n", " ('church', 321),\n", " ('voice', 319),\n", " ('countries', 318),\n", " ('gardens', 318),\n", " ('occupation', 315),\n", " ('jr', 313),\n", " ('french', 312),\n", " ('union', 310),\n", " ('23', 309),\n", " ('academy', 307),\n", " ('teachers', 307),\n", " ('1974', 307),\n", " ('ark', 306),\n", " ('ditson', 298),\n", " ('evening', 297),\n", " ('1891', 297),\n", " ('vocal', 296),\n", " ('landscapes', 295),\n", " ('view', 292),\n", " ('company', 291),\n", " ('mississippi', 291),\n", " ('negro', 285),\n", " ('edward', 284),\n", " ('oliver', 280),\n", " ('components', 280),\n", " ('girls', 279),\n", " ('families', 278),\n", " ('gowns', 278),\n", " ('top', 275),\n", " ('map', 274),\n", " ('1925', 272),\n", " ('construction', 267),\n", " ('baseball', 267),\n", " ('saint', 266),\n", " ('visible', 266),\n", " 
('views', 264),\n", " ('caroline', 263),\n", " ('universities', 263),\n", " ('tenn', 263),\n", " ('color', 263),\n", " ('schirmer', 263),\n", " ('activity', 258),\n", " ('sq', 258),\n", " ('missing', 257),\n", " ('chestnut', 257),\n", " ('general', 257),\n", " ('georgia', 257),\n", " ('structural', 255),\n", " ('joseph', 251),\n", " ('relationship', 250),\n", " ('4', 249),\n", " ('bettis', 249),\n", " ('water', 249),\n", " ('mary', 245),\n", " ('france', 244),\n", " ('taylor', 242),\n", " ('edgefield', 241),\n", " ('trenton', 239),\n", " ('normal', 238),\n", " ('roads', 238),\n", " ('field', 237),\n", " ('1895', 236),\n", " ('class', 235),\n", " ('street', 233),\n", " ('penn', 232),\n", " ('arranged', 232),\n", " ('dwellings', 231),\n", " ('alabama', 231),\n", " ('walker', 230),\n", " ('preachers', 228),\n", " ('1972', 226),\n", " ('broadway', 226),\n", " ('story', 225),\n", " ('1977', 224),\n", " ('photogravures', 223),\n", " ('31', 221),\n", " ('lee', 221),\n", " ('railroads', 220),\n", " ('pageants', 219),\n", " ('hench', 218),\n", " ('laughlin', 217),\n", " ('cumberland', 217),\n", " ('atcheson', 217),\n", " ('1922', 216),\n", " ('1928', 215),\n", " ('operas', 210),\n", " ('brothers', 209),\n", " ('session', 208),\n", " ('clothes', 207),\n", " ('costumes', 207),\n", " ('1923', 206),\n", " ('arkansas', 203),\n", " ('room', 203),\n", " ('louis', 202),\n", " ('couples', 202),\n", " ('henrico', 201),\n", " ('jefferson', 200),\n", " ('frank', 199),\n", " ('specific', 198),\n", " ('waltzes', 197),\n", " ('farms', 196),\n", " ('europe', 196),\n", " ('districts', 196),\n", " ('snow', 196),\n", " ('voices', 195),\n", " ('flowers', 195),\n", " ('paul', 192),\n", " ('richmond', 190),\n", " ('equipment', 189),\n", " ('boxing', 188),\n", " ('farm', 187),\n", " ('near', 187),\n", " ('alexander', 187),\n", " ('railroad', 187),\n", " ('arthur', 187),\n", " ('1970', 186),\n", " ('fla', 186),\n", " ('finals', 185),\n", " ('parish', 184),\n", " ('paris', 183),\n", " ('choruses', 
182),\n", " ('pavilion', 181),\n", " ('sir', 179),\n", " ('edwin', 179),\n", " ('hill', 179),\n", " ('pa', 179),\n", " ('president', 179),\n", " ('ruth', 177),\n", " ('monticello', 175),\n", " ('fields', 175),\n", " ('mixed', 174),\n", " ('14x17', 173),\n", " ('delta', 171),\n", " ('warner', 171),\n", " ('upper', 170),\n", " ('track', 170),\n", " ('tennessee', 169),\n", " ('team', 169),\n", " ('horse', 169),\n", " ('cabell', 168),\n", " ('1966', 167),\n", " ('porches', 166),\n", " ('1969', 166),\n", " ('1896', 166),\n", " ('1889', 165),\n", " ('one', 165),\n", " ('king', 161),\n", " ('hampton', 161),\n", " ('exhibits', 160),\n", " ('party', 160),\n", " ('club', 160),\n", " ('island', 159),\n", " ('helena', 159),\n", " ('central', 159),\n", " ('mass', 158),\n", " ('cultural', 158),\n", " ('worn', 158),\n", " ('phi', 157),\n", " ('hospital', 156),\n", " ('1858', 155),\n", " ('mammals', 154),\n", " ('35', 154),\n", " ('sacred', 154),\n", " ('composition', 152),\n", " ('physical', 152),\n", " ('aerial', 152),\n", " ('gloucester', 151),\n", " ('sweet', 151),\n", " ('railway', 151),\n", " ('center', 150),\n", " ('chicago', 150),\n", " ('boys', 150),\n", " ('1885', 150),\n", " ('1968', 150),\n", " ('stores', 150),\n", " ('engineering', 149),\n", " ('1819', 148),\n", " ('ohio', 148),\n", " ('plants', 148),\n", " ('medical', 146),\n", " ('love', 146),\n", " ('conference', 145),\n", " ('1870', 144),\n", " ('sigma', 144),\n", " ('business', 144),\n", " ('secular', 142),\n", " ('dance', 142),\n", " ('day', 141),\n", " ('personal', 141),\n", " ('man', 141),\n", " ('parts', 141),\n", " ('1911', 140),\n", " ('main', 140),\n", " ('1937', 140),\n", " ('chesterfield', 139),\n", " ('1875', 139),\n", " ('1976', 139),\n", " ('1929', 139),\n", " ('louisiana', 138),\n", " ('1830', 138),\n", " ('weddings', 138),\n", " ('exhibitions', 136),\n", " ('florida', 136),\n", " ('brown', 134),\n", " ('public', 134),\n", " ('scott', 134),\n", " ('techniques', 133),\n", " ('range', 133),\n", " 
('mountains', 133),\n", " ('pageant', 133),\n", " ('processes', 133),\n", " ('head', 132),\n", " ('1900', 132),\n", " ('1863', 131),\n", " ('1872', 131),\n", " ('scenes', 131),\n", " ('saalfield', 131),\n", " ('southern', 130),\n", " ('rural', 130),\n", " ('beaufort', 130),\n", " ('bodies', 129),\n", " ('albert', 129),\n", " ('deteriorating', 129),\n", " ('1971', 129),\n", " ('cincinnati', 128),\n", " ('basketball', 128),\n", " ('allen', 128),\n", " ('activities', 128),\n", " ('point', 127),\n", " ('africans', 127),\n", " ('helen', 127),\n", " ('1905', 126),\n", " ('accessories', 126),\n", " ('religion', 125),\n", " ('md', 125),\n", " ('vocational', 124),\n", " ('1813', 124),\n", " ('london', 124),\n", " ('retail', 124),\n", " ('1975', 124),\n", " ('1950', 124),\n", " ('brides', 124),\n", " ('slight', 123),\n", " ('graded', 123),\n", " ('1927', 123),\n", " ('score', 122),\n", " ('1850', 121),\n", " ('england', 121),\n", " ('medium', 121),\n", " ('1935', 121),\n", " ('green', 120),\n", " ('wood', 120),\n", " ('willig', 120),\n", " ('superintendent', 120),\n", " ('1926', 119),\n", " ('concepts', 118),\n", " ('dormitory', 118),\n", " ('grounds', 118),\n", " ('1839', 118),\n", " ('low', 118),\n", " ('dogs', 117),\n", " ('student', 116),\n", " ('memorial', 116),\n", " ('entirely', 116),\n", " ('francis', 116),\n", " ('mountain', 116),\n", " ('1809', 116),\n", " ('clark', 115),\n", " ('clergy', 114),\n", " ('martin', 114),\n", " ('none', 114),\n", " ('kenya', 114),\n", " ('1856', 113),\n", " ('1876', 113),\n", " ('gordon', 113),\n", " ('randolph', 113),\n", " ('1892', 113),\n", " ('1964', 113),\n", " ('webb', 112),\n", " ('edgar', 112),\n", " ('2', 112),\n", " ('arr', 112),\n", " ('1859', 111),\n", " ('1848', 111),\n", " ('wilson', 111),\n", " ('broken', 111),\n", " ('orleans', 110),\n", " ('daughters', 110),\n", " ('1910', 110),\n", " ('duets', 109),\n", " ('du', 108),\n", " ('chairs', 108),\n", " ('1826', 108),\n", " ('fourth', 108),\n", " ('statues', 108),\n", " 
('1901', 108),\n", " ('harry', 107),\n", " ('1967', 107),\n", " ('camp', 106),\n", " ('k', 105),\n", " ('exhibit', 105),\n", " ('walter', 105),\n", " ('chi', 105),\n", " ('statue', 105),\n", " ('nottoway', 104),\n", " ('fences', 104),\n", " ('coats', 103),\n", " ('kappa', 103),\n", " ('road', 103),\n", " ('briar', 103),\n", " ('1963', 103),\n", " ('harrison', 103),\n", " ('harris', 102),\n", " ('1909', 102),\n", " ('1874', 102),\n", " ('marion', 102),\n", " ('steps', 102),\n", " ('bowling', 102),\n", " ('edge', 102),\n", " ('franz', 101),\n", " ('1984', 100),\n", " ('v', 100),\n", " ('1936', 100),\n", " ('dormitories', 100),\n", " ('food', 99),\n", " ('walls', 99),\n", " ('1941', 99),\n", " ('1827', 99),\n", " ('1600', 99),\n", " ('1880', 99),\n", " ('1852', 98),\n", " ('veils', 98),\n", " ('natural', 98),\n", " ('1797', 97),\n", " ('residence', 97),\n", " ('1833', 97),\n", " ('1865', 97),\n", " ('chapel', 97),\n", " ('garden', 97),\n", " ('fur', 97),\n", " ('von', 96),\n", " ('station', 96),\n", " ('1828', 96),\n", " ('joining', 96),\n", " ('areas', 96),\n", " ('graduation', 95),\n", " ('additive', 95),\n", " ('1400', 95),\n", " ('1861', 94),\n", " ('garments', 94),\n", " ('samuel', 94),\n", " ('toys', 94),\n", " ('frederick', 94),\n", " ('office', 93),\n", " ('quartets', 92),\n", " ('1867', 92),\n", " ('geographic', 92),\n", " ('organ', 92),\n", " ('frogmore', 92),\n", " ('1939', 92),\n", " ('carter', 91),\n", " ('1980', 91),\n", " ('containers', 91),\n", " ('1877', 91),\n", " ('1871', 91),\n", " ('sea', 91),\n", " ('work', 91),\n", " ('national', 91),\n", " ('brick', 91),\n", " ('morris', 90),\n", " ('jones', 90),\n", " ('sonatas', 90),\n", " ('peters', 90),\n", " ('campbell', 90),\n", " ('academic', 90),\n", " ('chipped', 90),\n", " ('son', 89),\n", " ('dillard', 89),\n", " ('1869', 89),\n", " ('1986', 89),\n", " ('bridge', 89),\n", " ('marches', 89),\n", " ('books', 89),\n", " ('madison', 89),\n", " ('1965', 89),\n", " ('across', 89),\n", " ('beta', 88),\n", " 
('1979', 88),\n", " ('showing', 88),\n", " ('makers', 88),\n", " ('nurses', 88),\n", " ('monroe', 88),\n", " ('park', 87),\n", " ('1860', 87),\n", " ('river', 87),\n", " ('david', 87),\n", " ('alfred', 87),\n", " ('277', 87),\n", " ('front', 87),\n", " ('mount', 87),\n", " ('okla', 86),\n", " ('liberia', 86),\n", " ('1987', 86),\n", " ('montgomery', 86),\n", " ('marshall', 85),\n", " ('board', 85),\n", " ('1899', 85),\n", " ('1945', 84),\n", " ('alumni', 84),\n", " ('ceremonies', 84),\n", " ('albumen', 84),\n", " ('meeting', 84),\n", " ('corinne', 84),\n", " ('1832', 84),\n", " ('des', 83),\n", " ('stingray', 83),\n", " ('1837', 83),\n", " ('cleveland', 83),\n", " ('lewis', 82),\n", " ('1857', 82),\n", " ('churches', 82),\n", " ('red', 82),\n", " ('little', 82),\n", " ('ave', 82),\n", " ('carl', 82),\n", " ('sumter', 82),\n", " ('middlesex', 82),\n", " ('1883', 82),\n", " ('1847', 82),\n", " ('miller', 82),\n", " ('1888', 81),\n", " ('1907', 81),\n", " ('lawns', 81),\n", " ('obscuring', 81),\n", " ('yards', 81),\n", " ('jean', 81),\n", " ('laboratory', 81),\n", " ('1849', 81),\n", " ('consumer', 80),\n", " ('1862', 80),\n", " ('le', 80),\n", " ('et', 80),\n", " ('art', 80),\n", " ('court', 80),\n", " ('woman', 79),\n", " ('machinery', 79),\n", " ('hands', 79),\n", " ('groups', 79),\n", " ('1897', 79),\n", " ('oh', 78),\n", " ('bowles', 78),\n", " ('artifacts', 78),\n", " ('halifax', 78),\n", " ('1864', 77),\n", " ('williams', 77),\n", " ('ky', 77),\n", " ('department', 77),\n", " ('pond', 76),\n", " ('1973', 76),\n", " ('stations', 76),\n", " ('1829', 76),\n", " ('1886', 76),\n", " ('process', 76),\n", " ('1308', 76),\n", " ('1893', 76),\n", " ('rosenwald', 75),\n", " ('german', 75),\n", " ('military', 75),\n", " ('night', 75),\n", " ('minor', 75),\n", " ('tables', 74),\n", " ('anne', 74),\n", " ('player', 74),\n", " ('1810', 73),\n", " ('serpentine', 73),\n", " ('country', 73),\n", " ('small', 73),\n", " ('fair', 73),\n", " ('society', 73),\n", " ('frame', 
72),\n", " ('1978', 72),\n", " ('peter', 72),\n", " ('dining', 72),\n", " ('1835', 72),\n", " ('watercraft', 72),\n", " ('prince', 72),\n", " ('conferences', 71),\n", " ('1855', 71),\n", " ('wm', 71),\n", " ('christian', 71),\n", " ('johnson', 71),\n", " ('lawrenceville', 71),\n", " ('1842', 71),\n", " ('1868', 71),\n", " ('oak', 71),\n", " ('game', 71),\n", " ('large', 71),\n", " ('stadiums', 71),\n", " ('van', 71),\n", " ('navy', 71),\n", " ('1846', 71),\n", " ('1840', 71),\n", " ('professor', 71),\n", " ('islands', 70),\n", " ('lyon', 70),\n", " ('rock', 70),\n", " ('forest', 70),\n", " ('life', 70),\n", " ('stephen', 69),\n", " ('1985', 69),\n", " ('location', 69),\n", " ('ii', 69),\n", " ('queen', 69),\n", " ('fruit', 69),\n", " ('russell', 69),\n", " ('berry', 69),\n", " ('shelby', 69),\n", " ('scene', 68),\n", " ('41', 68),\n", " ('drawn', 68),\n", " ('1884', 68),\n", " ('frederic', 68),\n", " ('1887', 68),\n", " ('warren', 68),\n", " ('clubs', 68),\n", " ('held', 68),\n", " ('ridge', 68),\n", " ('visit', 67),\n", " ('norfolk', 67),\n", " ('gillingham', 67),\n", " ('1908', 67),\n", " ('1903', 67),\n", " ('1873', 67),\n", " ('adams', 67),\n", " ('government', 67),\n", " ('1807', 67),\n", " ('apples', 66),\n", " ('rogers', 66),\n", " ('mostly', 66),\n", " ('accelerator', 66),\n", " ('1981', 66),\n", " ('poe', 66),\n", " ('parties', 66),\n", " ('1841', 66),\n", " ('grove', 66),\n", " ('brunswick', 65),\n", " ('stone', 65),\n", " ('1949', 65),\n", " ('1982', 65),\n", " ('law', 65),\n", " ('1962', 65),\n", " ('july', 65),\n", " ('young', 65),\n", " ('athletes', 65),\n", " ('wives', 65),\n", " ('plant', 65),\n", " ('association', 65),\n", " ('1808', 65),\n", " ('windows', 65),\n", " ('printed', 65),\n", " ('blue', 65),\n", " ('michael', 64),\n", " ('crowds', 64),\n", " ('societies', 64),\n", " ('trip', 64),\n", " ('chesapeake', 64),\n", " ('1934', 64),\n", " ('1844', 64),\n", " ('carriages', 64),\n", " ('clarke', 64),\n", " ('1812', 64),\n", " ('c1866', 64),\n", " 
('trio', 63),\n", " ('continuing', 63),\n", " ('oklahoma', 63),\n", " ('america', 63),\n", " ('theodore', 63),\n", " ('1818', 62),\n", " ('sons', 62),\n", " ('horsemanship', 62),\n", " ('published', 62),\n", " ('fayette', 62),\n", " ('shannon', 62),\n", " ('since', 62),\n", " ('maria', 62),\n", " ('cookery', 62),\n", " ('manassas', 62),\n", " ('canada', 62),\n", " ('1823', 62),\n", " ('log', 62),\n", " ('rivers', 62),\n", " ('drawing', 62),\n", " ('tuskegee', 62),\n", " ('schmidt', 61),\n", " ('settlements', 61),\n", " ('mark', 61),\n", " ('tape', 61),\n", " ('1831', 61),\n", " ('1836', 61),\n", " ('funeral', 61),\n", " ('bible', 61),\n", " ('boxer', 60),\n", " ('affairs', 60),\n", " ('charlotte', 60),\n", " ('451', 60),\n", " ('service', 60),\n", " ('complexes', 60),\n", " ('weber', 60),\n", " ('hancock', 60),\n", " ('valley', 60),\n", " ('1898', 60),\n", " ('canning', 60),\n", " ('science', 59),\n", " ('98', 59),\n", " ('c1889', 59),\n", " ('tyler', 59),\n", " ('1786', 59),\n", " ('1983', 59),\n", " ('underwood', 59),\n", " ('lower', 59),\n", " ('orchards', 59),\n", " ('moore', 59),\n", " ('1851', 58),\n", " ('benjamin', 58),\n", " ('shakespeare', 58),\n", " ('duke', 58),\n", " ('1988', 58),\n", " ('herbaceous', 58),\n", " ('birds', 58),\n", " ('1820', 58),\n", " ('trade', 58),\n", " ('sewing', 58),\n", " ('first', 58),\n", " ('roofs', 58),\n", " ('area', 58),\n", " ('alpha', 58),\n", " ('1853', 57),\n", " ('rouge', 57),\n", " ('woodberry', 57),\n", " ('biology', 57),\n", " ('classrooms', 57),\n", " ('teacher', 57),\n", " ('canned', 57),\n", " ('figures', 57),\n", " ('painting', 57),\n", " ('1834', 57),\n", " ('maryland', 57),\n", " ('cracked', 57),\n", " ('edwards', 57),\n", " ('howard', 57),\n", " ('crb', 56),\n", " ('1817', 56),\n", " ('picture', 56),\n", " ('travis', 56),\n", " ('evans', 56),\n", " ('1933', 56),\n", " ('1932', 56),\n", " ('half', 56),\n", " ('1894', 56),\n", " ('1843', 56),\n", " ('cross', 56),\n", " ('ash', 56),\n", " ('par', 56),\n", " 
('earl', 55),\n", " ('epsilon', 55),\n", " ('1854', 55),\n", " ('baton', 55),\n", " ('fire', 55),\n", " ('theta', 55),\n", " ('guadalupe', 55),\n", " ('living', 55),\n", " ('robinson', 55),\n", " ('slightly', 55),\n", " ('1806', 55),\n", " ('clay', 55),\n", " ('dean', 55),\n", " ('show', 55),\n", " ('approximately', 54),\n", " ('greene', 54),\n", " ('relief', 54),\n", " ('graduate', 54),\n", " ('1779', 54),\n", " ('swain', 54),\n", " ('mayo', 54),\n", " ('williamsburg', 54),\n", " ('transparencies', 54),\n", " ('c1882', 53),\n", " ('1953', 53),\n", " ('c1883', 53),\n", " ('x', 53),\n", " ('real', 53),\n", " ('violins', 53),\n", " ('come', 53),\n", " ('hereford', 53),\n", " ('great', 53),\n", " ('5', 53),\n", " ('environments', 53),\n", " ('extending', 53),\n", " ('lake', 53),\n", " ('years', 53),\n", " ('gymnasium', 53),\n", " ('cabins', 53),\n", " ('thornton', 52),\n", " ('1815', 52),\n", " ('louisville', 52),\n", " ('1811', 52),\n", " ('daniel', 52),\n", " ('c1887', 52),\n", " ...]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pickle\n", "import nltk\n", "\n", "# Load the pickled Virginia collection data. The context manager closes the\n", "# file handle as soon as the pickle has been read.\n", "# NOTE: pickle.load can execute arbitrary code; only use it on trusted local files.\n", "with open(\"/media/storage/dpla-data/pickles/virginia.p\", \"rb\") as pickle_file:\n", "    vap = pickle.load(pickle_file)\n", "\n", "# Case-insensitive frequency distribution over the collection's filtered tokens.\n", "vafd = nltk.FreqDist(token.lower() for token in vap['virginia']['filtered'])\n", "vafd.most_common()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Gathering Stats for artstor\n", "{'fwc': 6518566, 'wc': 8466030, 'funiq': 60635, 'uniq': 60760}\n", "percent unique:\n", "0.00718 %\n", "filtered percent unique:\n", "0.0093 %\n", "*********\n", "\n", "Gathering Stats for biodiv\n", "{'fwc': 7638579, 'wc': 8361216, 'funiq': 94631, 'uniq': 
94755}\n", "percent unique:\n", "0.01133 %\n", "filtered percent unique:\n", "0.01239 %\n", "*********\n", "\n", "Gathering Stats for rumsey\n", "{'fwc': 12562369, 'wc': 15404401, 'funiq': 47643, 'uniq': 47763}\n", "percent unique:\n", "0.0031 %\n", "filtered percent unique:\n", "0.00379 %\n", "*********\n", "\n", "Gathering Stats for commonwealth" ] } ], "source": [ "colls = [\"artstor\",\"biodiv\",\"rumsey\",\"commonwealth\",\"georgia\",\"harvard\",\n", "         \"ia\",\"getty\",\"kentucky\",\"minnesota\",\"missouri\",\"mwdl\",\n", "         \"nara\",\"nocar\",\"smiths\",\"socar\",\"texas\",\"gpo\",\"illinois\",\"usc\",\"virginia\",\"nocoll\"]\n", "\n", "import pickle\n", "\n", "# Directory holding one <collection>.p pickle per collection.\n", "# Alternative (server) location: \"/media/storage/dpla-data/pickles/\"\n", "PICKLE_DIR = \"C:/Users/charper/dpla-temp/pickles/\"\n", "\n", "\n", "def percent(part, whole):\n", "    \"\"\"Return part/whole as a percentage rounded to 3 places (0.0 when whole is 0).\"\"\"\n", "    return round(100 * part / whole, 3) if whole else 0.0\n", "\n", "\n", "for c in colls:\n", "    # The context manager closes each file instead of leaking one handle per collection.\n", "    with open(PICKLE_DIR + c + \".p\", \"rb\") as pickle_file:\n", "        p = pickle.load(pickle_file)\n", "    print(\"\\nGathering Stats for \" + c)\n", "    # Deliberately not named `stats`: that name is bound to the dict loaded in the first cell.\n", "    coll_stats = p[c]['stats']\n", "    print(coll_stats)\n", "    # The ratios are scaled by 100 so the printed number really is the percentage\n", "    # that the trailing \"%\" claims.\n", "    print(\"percent unique:\")\n", "    print(percent(coll_stats['uniq'], coll_stats['wc']), \"%\")\n", "    print(\"filtered percent unique:\")\n", "    print(percent(coll_stats['funiq'], coll_stats['fwc']), \"%\")\n", "    print(\"*********\")\n" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "31738" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Number of hapaxes (samples that occur exactly once) in the Virginia distribution.\n", "len(vafd.hapaxes())" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Keep only the tokens longer than 10 characters, together with their counts.\n", "longwords = {word: count for word, count in vafd.items() if len(word) > 10}" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "{'Accelerator': 65,\n", " 'Accessories': 120,\n", " 'Accomodates': 2,\n", " 'Accompanying': 1,\n", " 'Adjustments': 1,\n", " 'Administration': 
2,\n", " 'Administrative': 13,\n", " 'Adminstration': 1,\n", " 'Aduertissement': 1,\n", " 'Advertisement': 2,\n", " 'Advertising': 5,\n", " 'Aerodynamics': 2,\n", " 'Aeronautical': 5,\n", " 'Aeronautics': 37,\n", " 'Agricultural': 303,\n", " 'Agriculture': 13,\n", " 'Alterations': 1,\n", " 'Amabassador': 1,\n", " 'Amazonenvitt': 1,\n", " 'Amphitheater': 2,\n", " 'Amphitheaters': 9,\n", " 'Amphitheatre': 43,\n", " 'Ampitheatre': 4,\n", " 'Amstelodami': 1,\n", " 'Anniversaries': 5,\n", " 'Anniversary': 22,\n", " 'Announcement': 1,\n", " 'Antiquities': 2,\n", " 'Apologetics': 1,\n", " 'Apparizione': 1,\n", " 'Appointment': 1,\n", " 'Archdeacons': 1,\n", " 'Architectural': 313,\n", " 'Architecture': 1012,\n", " 'Aristomenes': 1,\n", " 'Arrangements': 1,\n", " 'Assassination': 2,\n", " 'Association': 64,\n", " 'Associations': 7,\n", " 'Astronomical': 1,\n", " 'Auditoriums': 4,\n", " 'Aufforderung': 4,\n", " 'Automobiles': 351,\n", " 'Automoblies': 1,\n", " 'Badarzewska': 3,\n", " 'Balustrades': 5,\n", " 'Barboursville': 18,\n", " 'Barcarolles': 2,\n", " 'Barhamsville': 12,\n", " 'Bartholomew': 6,\n", " 'Battlements': 1,\n", " 'Baylorsville': 5,\n", " 'Beaujoyeulx': 1,\n", " 'Beckenstein': 3,\n", " 'Beuckenstein': 2,\n", " 'Bibliographical': 1,\n", " 'Bibliopolas': 1,\n", " 'Biscaccianti': 1,\n", " 'Bishopville': 2,\n", " 'Bjørnstjerne': 1,\n", " 'Blackboards': 6,\n", " 'Blacksmiths': 1,\n", " 'Blankenship': 2,\n", " 'Blennerhassett': 1,\n", " 'Blessington': 2,\n", " 'Bloomington': 1,\n", " 'Blumentritt': 1,\n", " 'Bodybuilders': 41,\n", " 'Bodybuilding': 4,\n", " 'Bonaventure': 2,\n", " 'Bonnycastle': 1,\n", " 'Bookbinders': 2,\n", " 'Bookbinding': 2,\n", " 'Bookshelves': 2,\n", " 'Brandeville': 1,\n", " 'Brandstetter': 3,\n", " 'Breckinridge': 2,\n", " 'Bridegrooms': 24,\n", " 'Bridgeforth': 1,\n", " 'Bridgewater': 1,\n", " 'Brockenbraugh': 1,\n", " 'Brownsville': 23,\n", " 'Brueschweiler': 1,\n", " 'Buentivolio': 1,\n", " 'Businessmen': 1,\n", " 'Butterflies': 1,\n", 
" 'COLLECTIONS': 1,\n", " 'Cadwallader': 1,\n", " 'Calcografia': 1,\n", " 'Calculating': 1,\n", " 'Cannonballs': 1,\n", " 'Canzonettas': 1,\n", " 'Cartersville': 39,\n", " 'Cartographic': 2,\n", " 'Castiglione': 4,\n", " 'Catholiques': 1,\n", " 'Cazouillement': 1,\n", " 'Celebration': 27,\n", " 'Celebrations': 3,\n", " 'Certificate': 2,\n", " 'Certificates': 2,\n", " 'Chamberlain': 9,\n", " 'Chamberlayne': 20,\n", " 'Chancellors': 1,\n", " 'Chandeliers': 2,\n", " 'Charakterstu': 1,\n", " 'Charlemagne': 2,\n", " 'Charlestown': 1,\n", " 'Charlottesviile': 1,\n", " 'Charlottesville': 54329,\n", " 'Chartseller': 1,\n", " 'Chatterbrick': 1,\n", " 'Chesterfield': 139,\n", " 'Christening': 3,\n", " 'Christianity': 2,\n", " 'Christiansburg': 11,\n", " 'Christopher': 14,\n", " 'Churchyards': 1,\n", " 'Clarksville': 2,\n", " 'Classification': 1,\n", " 'Clatterbrick': 1,\n", " 'Clinchfield': 2,\n", " 'Clotheslines': 1,\n", " 'Cobblestone': 1,\n", " 'Collections': 26682,\n", " 'Combination': 1,\n", " 'Commencement': 24,\n", " 'Commencements': 14,\n", " 'Commentaires': 1,\n", " 'Commisioner': 1,\n", " 'Commissioned': 2,\n", " 'Commissioner': 1,\n", " 'Commissioners': 3,\n", " 'Commisssioners': 1,\n", " 'Comparative': 1,\n", " 'Competition': 1,\n", " 'Confederacy': 1,\n", " 'Confederate': 52,\n", " 'Conferences': 71,\n", " 'Confidentially': 1,\n", " 'Confirmation': 1,\n", " 'Confirmations': 1,\n", " 'Congregation': 2,\n", " 'Congressional': 2,\n", " 'Conservatories': 1,\n", " 'Considerations': 1,\n", " 'Consolation': 2,\n", " 'Consolidated': 14,\n", " 'Consolidating': 1,\n", " 'Constantine': 2,\n", " 'Constitution': 2,\n", " 'Constitutional': 1,\n", " 'Construction': 114,\n", " 'Continental': 8,\n", " 'Continuation': 3,\n", " 'Contreblasons': 1,\n", " 'Convenience': 1,\n", " 'Conversation': 1,\n", " 'Convocation': 3,\n", " 'Cooperation': 8,\n", " 'Cooperative': 20,\n", " 'Coopersmith': 1,\n", " 'Coordinated': 1,\n", " 'Cornachione': 1,\n", " 'Coronations': 2,\n", " 
'Corporation': 11,\n", " 'Corporations': 1,\n", " 'Corporative': 1,\n", " 'Correctional': 1,\n", " 'Corrections': 1,\n", " 'Correspondence': 1,\n", " 'Corrugating': 5,\n", " 'Cosmographie': 1,\n", " 'Cosstaphney': 1,\n", " 'Courlaender': 1,\n", " 'Courthouses': 32,\n", " 'Cowperthwait': 1,\n", " 'Critchenberger': 2,\n", " 'Critenbarger': 2,\n", " 'Crutchfield': 2,\n", " 'Dardensburg': 1,\n", " 'Declaration': 36,\n", " 'Declaratory': 2,\n", " 'Decorations': 2,\n", " 'Dedications': 1,\n", " 'Demonstation': 1,\n", " 'Demonstration': 32,\n", " 'Demonstrations': 16,\n", " 'Demonstrator': 1,\n", " 'Demonstrators': 1,\n", " 'Departments': 1,\n", " 'Description': 15,\n", " 'Destruction': 1,\n", " 'Development': 12,\n", " 'Diepenbeeck': 1,\n", " 'Discoloration': 3,\n", " 'Dispensations': 2,\n", " 'Disposition': 1,\n", " 'Distinguished': 10,\n", " 'Distributed': 2,\n", " 'Divertissement': 1,\n", " 'Documentary': 3,\n", " 'Dodelinette': 1,\n", " 'Dormitories': 100,\n", " 'Dressmakers': 2,\n", " 'Dressmaking': 1,\n", " 'Dunfermline': 1,\n", " 'Educational': 15,\n", " 'Electricity': 2,\n", " 'Elefterious': 1,\n", " 'Emancipation': 2,\n", " 'Embroidering': 1,\n", " 'Encarnacion': 1,\n", " 'Enchantment': 1,\n", " 'Encyclopaedia': 2,\n", " 'Engineering': 144,\n", " 'Entablatures': 1,\n", " 'Entertainers': 2,\n", " 'Entertainment': 46,\n", " 'Environments': 44,\n", " 'Equestrians': 3,\n", " 'Ergenbright': 2,\n", " 'Establishment': 3,\n", " 'Estudiantina': 1,\n", " 'Eternamente': 1,\n", " 'Euangelistes': 1,\n", " 'Eucharistie': 1,\n", " 'Evangelical': 2,\n", " 'Exhibitions': 135,\n", " 'Experimental': 25,\n", " 'Fairgrounds': 3,\n", " 'Fantasiestu': 3,\n", " 'Faschingsschwank': 1,\n", " 'Fayerweather': 28,\n", " 'Fayetteville': 19,\n", " 'Fenchtenberger': 1,\n", " 'Fieuberlake': 1,\n", " 'Fiskerjenten': 1,\n", " 'Fitzpatrick': 4,\n", " 'Fleetstreet': 8,\n", " 'Folkelivsbilleder': 1,\n", " 'Fontainebleau': 2,\n", " 'Footbridges': 4,\n", " 'Foraarstoner': 1,\n", " 'Fortification': 
6,\n", " 'Fortifications': 1,\n", " 'Foundations': 8,\n", " 'Fountainebleau': 1,\n", " 'Fraternities': 39,\n", " 'Fredericksburg': 23,\n", " 'Freemasonry': 8,\n", " 'Freiligrath': 1,\n", " 'Friedenwald': 1,\n", " 'Furnishings': 18,\n", " 'Gainesville': 1,\n", " 'Gainsborough': 1,\n", " 'Gallicanism': 1,\n", " 'Gendarmerie': 1,\n", " 'Generations': 2,\n", " 'Geographical': 2,\n", " 'Giambattista': 1,\n", " 'Gildersleeve': 16,\n", " 'Gillenwater': 3,\n", " 'Goldschmetterlinge': 1,\n", " 'Goldschmidt': 1,\n", " 'Gordonsville': 1,\n", " 'Granddaughter': 1,\n", " 'Grandfather': 3,\n", " 'Grandfathers': 2,\n", " 'Grandmother': 4,\n", " 'Grandmothers': 1,\n", " 'Grandparents': 1,\n", " 'Grandsaigne': 1,\n", " 'Grandstands': 14,\n", " 'Graniteville': 1,\n", " 'Greeleyville': 6,\n", " 'Greenhouses': 2,\n", " 'Greensville': 3,\n", " 'Grigorʹevich': 1,\n", " 'Grillparzer': 3,\n", " 'Groundbreakings': 4,\n", " 'Hairdressing': 4,\n", " 'Hallettsville': 4,\n", " 'Hammarstrand': 2,\n", " 'Hanfstaengl': 1,\n", " 'Harpsichord': 26,\n", " 'Harrisonburg': 8,\n", " 'Hattiesburg': 11,\n", " 'Headquarters': 3,\n", " 'Hecatongraphie': 1,\n", " 'Hendrickson': 2,\n", " 'Heppenheimer': 2,\n", " 'Herrodsburg': 1,\n", " 'Herzallerliebsten': 2,\n", " 'Hetherington': 2,\n", " 'Hippocrates': 2,\n", " 'Hirondelles': 1,\n", " 'Historiarum': 1,\n", " 'Hodgkinsonne': 1,\n", " 'Hollingsworth': 2,\n", " 'Homesickness': 1,\n", " 'Honeysuckle': 1,\n", " 'Horsemanship': 62,\n", " 'Hortenstein': 1,\n", " 'Hortensteine': 1,\n", " 'Horticultural': 3,\n", " 'Horticulture': 7,\n", " 'Humoresques': 2,\n", " 'Hydrographer': 1,\n", " 'Hydrographical': 2,\n", " 'Identifying': 18,\n", " 'Illustrated': 2,\n", " 'Illustrations': 2,\n", " 'Improvement': 11,\n", " 'Improvisations': 1,\n", " 'Inaguration': 1,\n", " 'Inauguration': 17,\n", " 'Incorporated': 2,\n", " 'Independance': 1,\n", " 'Independence': 32,\n", " 'Independent': 6,\n", " 'Indianapolis': 4,\n", " 'Industrialization': 1,\n", " 'Infirmaries': 3,\n", " 
'Information': 44,\n", " 'Infrastructural': 9,\n", " 'Infrastructure': 1,\n", " 'Inheritance': 1,\n", " 'Inscriptions': 1,\n", " 'Institution': 1,\n", " 'Institutional': 778,\n", " 'Institutions': 1,\n", " 'Instruction': 1,\n", " 'Instrumental': 1,\n", " 'Insurrection': 1,\n", " 'Integration': 1,\n", " 'Intelligence': 1,\n", " 'Interallied': 1,\n", " 'Intermediate': 1,\n", " 'International': 2,\n", " 'Internationally': 2,\n", " 'Interscholastic': 1,\n", " 'Intersection': 1,\n", " 'Intracoastal': 1,\n", " 'Introduktion': 1,\n", " 'Investigation': 1,\n", " 'Jacksonville': 17,\n", " 'JeanesTeachers': 1,\n", " 'Jeffersonville': 1,\n", " 'Katzenstein': 1,\n", " 'Kiinderscenen': 1,\n", " 'Kindergarten': 4,\n", " 'Kinderscenen': 1,\n", " 'Kirkpatrick': 2,\n", " 'Knickerbockers': 1,\n", " 'Kompositionen': 1,\n", " 'Laboratories': 13,\n", " 'Lamentations': 1,\n", " 'Landscaping': 1,\n", " 'Lawrenceville': 71,\n", " 'Legislative': 2,\n", " 'Legislators': 1,\n", " 'Legislature': 2,\n", " 'Lescribleur': 1,\n", " 'Letterheads': 2,\n", " 'Liebestrank': 1,\n", " 'Liebestraum': 1,\n", " 'Liebeswonne': 1,\n", " 'Lindencrone': 1,\n", " 'Lindenmeyer': 1,\n", " 'Lindpaintner': 3,\n", " 'Litchtenberg': 1,\n", " 'Lithographed': 3,\n", " 'Lithographic': 2,\n", " 'Lithographing': 2,\n", " 'Lithographs': 22,\n", " 'Lithography': 3,\n", " 'Livingstone': 4,\n", " 'Locomotives': 21,\n", " 'Mademoiselle': 1,\n", " 'Madisonville': 2,\n", " 'Mandolinata': 1,\n", " 'Mantelpiece': 2,\n", " 'Manufactures': 1,\n", " 'Manufacturing': 3,\n", " 'Manuscripts': 1,\n", " 'Marguerites': 1,\n", " 'Marigliotta': 2,\n", " 'Marketplaces': 10,\n", " 'Marlborough': 2,\n", " 'Marseillaise': 1,\n", " 'Masquerades': 2,\n", " 'Massachusetts': 12,\n", " 'Mathematics': 1,\n", " 'Maximillian': 7,\n", " 'McMinnville': 2,\n", " 'Mecklenburg': 1,\n", " 'Mediterranei': 2,\n", " 'Medringhaus': 2,\n", " 'Meistersinger': 2,\n", " 'Membranophones': 3,\n", " 'Mendelssohn': 29,\n", " 'Menosprecio': 1,\n", " 'Meriwtether': 1,\n", 
" 'Metalworking': 2,\n", " 'Middelburgum': 1,\n", " 'Middleditch': 2,\n", " 'Miscellaneous': 3,\n", " 'Missionaries': 3,\n", " 'Mississippi': 291,\n", " 'Mollenhauer': 2,\n", " 'Mondenschein': 1,\n", " 'Monseigneur': 2,\n", " 'Montecastle': 2,\n", " 'Montmorency': 1,\n", " 'Morningside': 6,\n", " 'Morrissette': 1,\n", " 'Mountaineer': 1,\n", " 'Multiplication': 1,\n", " 'Musikalische': 1,\n", " 'Napolitaine': 1,\n", " 'Naturalization': 1,\n", " 'Nebuchadnezzar': 1,\n", " 'Neighborhood': 2,\n", " 'Netherlands': 13,\n", " 'Newfoundland': 1,\n", " 'Newspictures': 3,\n", " 'Nightengale': 1,\n", " 'Nightingale': 3,\n", " 'Nonprescription': 1,\n", " 'Nonprojected': 3,\n", " 'Northampton': 24,\n", " 'Northumberland': 1,\n", " 'Northwestern': 1,\n", " 'Nouuellement': 3,\n", " 'Nullification': 1,\n", " 'Observatories': 10,\n", " 'Observatory': 17,\n", " 'Occidentales': 1,\n", " 'Occidentalioribus': 1,\n", " 'Offertories': 5,\n", " 'Opernthemas': 1,\n", " 'Oppugnation': 1,\n", " 'Organisations': 1,\n", " 'Organizations': 4,\n", " 'Orientation': 9,\n", " 'Orthography': 4,\n", " 'Outbuilding': 1,\n", " 'Outbuildings': 19,\n", " 'Outerbridge': 9,\n", " 'Outstanding': 1,\n", " 'Overexposed': 2,\n", " 'Oxfordshire': 21,\n", " 'Pagenstecker': 1,\n", " 'Pantagrueline': 2,\n", " 'Paraphrases': 1,\n", " 'Participation': 1,\n", " 'Pedestrians': 3,\n", " 'Penitential': 2,\n", " 'Pennsylvania': 35,\n", " 'Pennyslvania': 1,\n", " 'Pensilvania': 1,\n", " 'Performances': 12,\n", " 'Periodicals': 13,\n", " 'Peterborough': 1,\n", " 'Philadelphia': 743,\n", " 'Phildelphia': 1,\n", " 'Phillibrown': 16,\n", " 'Photocopies': 5,\n", " 'Photographed': 1,\n", " 'Photographers': 40,\n", " 'Photographic': 10,\n", " 'Photographs': 5524,\n", " 'Photography': 5521,\n", " 'Photogravure': 1,\n", " 'Photogravures': 223,\n", " 'Piccolomini': 1,\n", " 'Plaetsnyder': 1,\n", " 'Plantations': 4,\n", " 'Playgrounds': 17,\n", " 'Polytechnical': 2,\n", " 'Poniatowski': 1,\n", " 'Poplarville': 8,\n", " 
'Posselwhite': 3,\n", " 'Preliminary': 3,\n", " 'Presbyteriain': 1,\n", " 'Presbyterian': 3,\n", " 'Presbyterians': 1,\n", " 'Presentation': 3,\n", " 'Presentazione': 1,\n", " 'Presidential': 16,\n", " 'Printseller': 9,\n", " 'Printsellers': 1,\n", " 'Privateering': 1,\n", " 'Proceedings': 1,\n", " 'Processions': 28,\n", " 'Prohibition': 4,\n", " 'Propagation': 1,\n", " 'Proprietors': 1,\n", " 'Protectionism': 1,\n", " 'Protestantism': 1,\n", " 'Protestation': 1,\n", " 'Psychotropic': 1,\n", " 'Publications': 5,\n", " 'Punchinello': 1,\n", " 'Punctuation': 1,\n", " 'Quesenberry': 1,\n", " 'RailroadCompany': 1,\n", " 'Rappahannock': 7,\n", " 'Ravenscroft': 1,\n", " 'Recollections': 1,\n", " 'Reconciliation': 1,\n", " 'Reconstruction': 3,\n", " 'Recreational': 10,\n", " 'Reflections': 1,\n", " 'Reformation': 1,\n", " 'Reformatories': 4,\n", " 'Reformatory': 8,\n", " 'Refreshments': 1,\n", " 'Refridgerators': 1,\n", " 'Refrigerator': 1,\n", " 'Refrigerators': 3,\n", " 'Registration': 19,\n", " 'Regulations': 1,\n", " 'Rehabilitation': 2,\n", " 'Remembrance': 1,\n", " 'Reminiscences': 2,\n", " 'Remonstrance': 4,\n", " 'Rentiesville': 2,\n", " 'Representations': 3,\n", " 'Residential': 1,\n", " 'Restaurants': 14,\n", " 'Restoration': 5,\n", " 'Resurrection': 2,\n", " 'Revolutionary': 2,\n", " 'Rheinberger': 3,\n", " 'Ritournelle': 1,\n", " 'Rittenhouse': 3,\n", " 'Riverfronts': 2,\n", " 'Rockefeller': 2,\n", " 'Rondolettos': 1,\n", " 'Rorzwaukaski': 2,\n", " 'Rosenbecker': 1,\n", " 'Rudersdorff': 2,\n", " 'Saltonstall': 2,\n", " 'Sandersville': 24,\n", " 'Satterfield': 1,\n", " 'Satterthwaite': 3,\n", " 'Saunnaituis': 2,\n", " 'Scarborough': 1,\n", " 'Scharfenberg': 13,\n", " 'Schleiffarth': 1,\n", " 'Schlepegrell': 1,\n", " 'Schlesinger': 1,\n", " 'Schnatterly': 1,\n", " 'Schoolbuildings': 1,\n", " 'Schoolhouse': 15,\n", " 'Schottisches': 17,\n", " 'Schottishce': 1,\n", " 'Schwalbenbotschaft': 1,\n", " 'Schwanengesang': 10,\n", " 'Searchlights': 1,\n", " 'Sebastianum': 
1,\n", " 'Segregation': 18,\n", " 'Seneviratne': 1,\n", " 'Sesquicentennial': 7,\n", " 'Settlements': 45,\n", " 'Seventeenth': 4,\n", " 'Shackelford': 12,\n", " 'Shackleford': 2,\n", " 'Shaftesbury': 2,\n", " 'Shakespeare': 58,\n", " 'Shakespearean': 2,\n", " 'Shepherdstown': 3,\n", " 'Sherrington': 2,\n", " 'Shipbuilding': 1,\n", " 'Shoalwalter': 1,\n", " 'Silhouettes': 11,\n", " 'Smokestacks': 1,\n", " 'Somersville': 2,\n", " 'Sommernachtstraum': 2,\n", " 'Southampton': 10,\n", " 'Southwestern': 3,\n", " 'Spielmannslieder': 1,\n", " 'Spotslyvania': 1,\n", " 'Spotsylvania': 9,\n", " 'Spottsylvania': 1,\n", " 'Springfield': 6,\n", " 'Stadtkirche': 1,\n", " 'Stalactites': 3,\n", " 'Stallknecht': 2,\n", " 'Stepmothers': 1,\n", " 'Stereoscope': 1,\n", " 'Stereoscopic': 2,\n", " 'Stockbridge': 2,\n", " 'Storefronts': 3,\n", " 'Storytellers': 1,\n", " 'Streetscapes': 3,\n", " 'Stringfellow': 8,\n", " 'Stringfield': 1,\n", " 'Superintendent': 120,\n", " 'Superintendents': 12,\n", " 'Supervising': 2,\n", " 'Supervisors': 1,\n", " 'Supterintendent': 1,\n", " 'Surrounding': 2,\n", " 'Switzerland': 13,\n", " 'Tablecloths': 2,\n", " 'Tallahassee': 13,\n", " 'Tanzmomente': 1,\n", " 'Tanzskizzen': 1,\n", " 'Tappahannock': 1,\n", " 'Tarantellas': 3,\n", " 'Tchaikovsky': 2,\n", " 'Terraqueous': 1,\n", " 'Territorial': 2,\n", " 'Teschemacher': 1,\n", " 'Testimonial': 1,\n", " 'Thanksgiving': 1,\n", " 'Theological': 21,\n", " 'Therapentics': 1,\n", " 'Therapeutics': 2,\n", " 'Theyendanagea': 1,\n", " 'Thomasville': 1,\n", " 'Thunderbolt': 5,\n", " 'Tillinghast': 1,\n", " 'Timmonsville': 1,\n", " 'Topographic': 1,\n", " 'Topographical': 2,\n", " 'Transfusion': 1,\n", " 'Transilvania': 1,\n", " 'Translating': 1,\n", " 'Transparencies': 23,\n", " 'Transportation': 35,\n", " 'Transvestism': 28,\n", " 'Triosonatas': 1,\n", " 'Typographos': 1,\n", " 'Ueberwasser': 1,\n", " 'Unaccompanied': 10,\n", " 'Undentified': 1,\n", " 'Undergraduate': 8,\n", " 'Undergraduates': 8,\n", " 
'Underground': 12,\n", " 'Unidentfied': 1,\n", " 'Unidentifed': 9,\n", " 'Unidentified': 718,\n", " 'Universitat': 1,\n", " 'Universities': 262,\n", " 'Universitut': 1,\n", " 'Universtity': 1,\n", " 'Unsatisfactory': 2,\n", " 'Unterhaltung': 1,\n", " 'Variationen': 1,\n", " 'Velimirovic': 2,\n", " 'Versification': 3,\n", " 'Villefranche': 1,\n", " 'Violoncello': 2,\n", " 'VirginiaFraternity': 1,\n", " 'Waccowochie': 1,\n", " 'Wagenknight': 3,\n", " 'Wallerstein': 3,\n", " 'Washingtons': 1,\n", " 'Watercolors': 3,\n", " 'Waterfronts': 3,\n", " 'Watermelons': 30,\n", " 'Weatherford': 2,\n", " 'Weissenborn': 1,\n", " 'Wertenbacker': 1,\n", " 'Wertenbaker': 12,\n", " 'Westerville': 1,\n", " 'Westmoreland': 4,\n", " 'Wheelbarrows': 5,\n", " 'Wheelchairs': 1,\n", " 'Wheelwright': 1,\n", " 'Wheelwrights': 1,\n", " 'Whitechurch': 1,\n", " 'Wiggleworth': 2,\n", " 'Williamsburg': 54,\n", " 'Williamsport': 2,\n", " 'Winsborough': 1,\n", " 'Winterhalter': 1,\n", " 'Wollenhaupt': 8,\n", " 'Woodworking': 13,\n", " 'Worthington': 4,\n", " 'Wrigglesworth': 2,\n", " 'accelerator': 1,\n", " 'accessories': 6,\n", " 'accommodating': 2,\n", " 'accommodations': 8,\n", " 'administration': 27,\n", " 'administrators': 2,\n", " 'adolescence': 1,\n", " 'advertisement': 2,\n", " 'advertising': 2,\n", " 'agricultural': 27,\n", " 'agriculture': 5,\n", " 'anniversary': 7,\n", " 'appassionata': 1,\n", " 'application': 1,\n", " 'appreciation': 1,\n", " 'approximately': 54,\n", " 'archeuesque': 2,\n", " 'architectural': 65,\n", " 'architecture': 317,\n", " 'artistiques': 1,\n", " 'association': 1,\n", " 'associations': 3,\n", " 'authorities': 2,\n", " 'automobiles': 1,\n", " 'automoblile': 1,\n", " 'battlefield': 1,\n", " 'beautifying': 3,\n", " 'benediction': 1,\n", " 'bottomlands': 1,\n", " 'caertmaecker': 1,\n", " 'capriccioso': 3,\n", " 'cartographic': 363,\n", " 'celebrating': 1,\n", " 'celebration': 1,\n", " 'celebrations': 9,\n", " 'chrestienne': 1,\n", " 'clarissimum': 1,\n", " 
'combination': 1,\n", " 'communications': 1,\n", " 'composition': 152,\n", " 'comprehension': 1,\n", " 'confirmirte': 1,\n", " 'congregation': 1,\n", " 'connections': 6,\n", " 'consequences': 1,\n", " 'conservatory': 1,\n", " 'consolation': 1,\n", " 'consolidated': 3,\n", " 'consolidation': 1,\n", " 'constructed': 1,\n", " 'construction': 153,\n", " 'convocation': 2,\n", " 'cornerstone': 1,\n", " 'cultivation': 1,\n", " 'damoyselles': 1,\n", " 'declaration': 3,\n", " 'declination': 1,\n", " 'dedications': 1,\n", " 'deliberately': 1,\n", " 'demonstrates': 1,\n", " 'demonstration': 19,\n", " 'demonstrator': 1,\n", " 'demonstrators': 1,\n", " 'departments': 1,\n", " 'deportemens': 1,\n", " 'description': 4,\n", " 'destruction': 1,\n", " 'detereriorating': 5,\n", " 'deterioraing': 1,\n", " 'deteriorate': 1,\n", " 'deteriorating': 129,\n", " 'deterioration': 2,\n", " 'development': 21,\n", " 'developments': 1,\n", " 'dignitaries': 1,\n", " 'dilapidated': 1,\n", " 'disciplines': 10,\n", " 'discoloration': 22,\n", " 'discolorations': 1,\n", " 'discoveries': 4,\n", " 'disposition': 1,\n", " 'distinguished': 1,\n", " 'diversified': 7,\n", " 'educational': 1,\n", " 'eliminating': 1,\n", " 'engineering': 5,\n", " 'enseignemens': 1,\n", " 'enterprises': 3,\n", " 'entertainers': 4,\n", " 'entertainment': 2,\n", " 'environment': 1,\n", " 'environments': 9,\n", " 'eradicating': 1,\n", " 'exhibitions': 1,\n", " 'exploration': 3,\n", " 'expositions': 3,\n", " 'facinoribus': 1,\n", " 'furnishings': 2,\n", " 'generations': 1,\n", " 'geographischen': 1,\n", " 'governments': 1,\n", " 'granddaughter': 1,\n", " 'groundbreaking': 20,\n", " 'handwriting': 2,\n", " 'harpsichord': 3,\n", " 'headquarters': 1,\n", " 'hecatodistichon': 2,\n", " 'hlingsglaube': 1,\n", " 'horizontally': 2,\n", " 'hydrographica': 2,\n", " 'identifiable': 6,\n", " 'identifying': 1,\n", " 'illustrissime': 1,\n", " 'importables': 1,\n", " 'improvement': 9,\n", " 'improvements': 6,\n", " 'inauguration': 1,\n", " 
'independant': 1,\n", " 'information': 15905,\n", " 'inhabitants': 1,\n", " 'inscription': 1,\n", " 'installation': 1,\n", " 'institution': 2,\n", " 'institutions': 2,\n", " 'instruction': 2,\n", " 'instructives': 1,\n", " 'instrumental': 451,\n", " 'instruments': 25,\n", " 'insurrections': 1,\n", " 'intellectual': 1,\n", " 'interdisciplinary': 2,\n", " 'interesting': 1,\n", " 'interjacentiumq': 1,\n", " 'interpretation': 1,\n", " 'interpreting': 1,\n", " 'introductio': 1,\n", " 'irregularly': 1,\n", " 'irresesitible': 1,\n", " 'irresistable': 1,\n", " 'lVniuersite': 1,\n", " 'laboratories': 1,\n", " 'laboratoryatue': 1,\n", " 'legislation': 2,\n", " 'legislature': 1,\n", " 'linstitution': 1,\n", " 'lithographer': 1,\n", " 'lithographs': 1,\n", " 'locomotives': 1,\n", " 'magnificence': 1,\n", " 'magnificque': 2,\n", " 'manufacture': 1,\n", " 'manufacturer': 1,\n", " 'manufacturing': 1,\n", " 'mathematics': 3,\n", " 'membranophones': 1,\n", " 'mercimoniis': 1,\n", " 'merveilleux': 1,\n", " 'mousquetaire': 1,\n", " 'napolitaine': 1,\n", " 'naturalization': 1,\n", " 'neighborhood': 3,\n", " 'nightingale': 1,\n", " 'nouuellement': 1,\n", " 'nstlerleben': 1,\n", " 'observations': 1,\n", " 'obstruction': 5,\n", " 'opportunity': 1,\n", " 'organization': 2,\n", " 'orientalioribus': 1,\n", " 'orientation': 2,\n", " 'oscilloscope': 1,\n", " 'overagainst': 1,\n", " 'overlooking': 1,\n", " 'participate': 1,\n", " 'particularly': 1,\n", " 'performances': 1,\n", " 'periodicals': 1,\n", " 'photographic': 10376,\n", " 'photographs': 6040,\n", " 'photography': 7,\n", " 'picturesque': 1,\n", " 'plantations': 2,\n", " 'practically': 1,\n", " 'predominantly': 1,\n", " 'preparation': 2,\n", " 'presentation': 5,\n", " 'principales': 1,\n", " 'printmaking': 1,\n", " 'productions': 32,\n", " 'prognostication': 2,\n", " 'progressives': 1,\n", " 'promptuaire': 1,\n", " 'promulgation': 1,\n", " 'prononciation': 1,\n", " 'provinciarum': 3,\n", " 'publication': 1,\n", " 'quarterback': 5,\n", " 
'recognizable': 1,\n", " 'reconstruction': 7,\n", " 'recreations': 1,\n", " 'refreshments': 1,\n", " 'regulations': 1,\n", " 'relationship': 250,\n", " 'remembering': 1,\n", " 'representatives': 2,\n", " 'reproduction': 1,\n", " 'reproductions': 48,\n", " 'resiouissance': 2,\n", " 'respectibus': 1,\n", " 'resplendent': 1,\n", " 'restrictions': 5518,\n", " 'resurrection': 1,\n", " 'reverendissime': 1,\n", " 'sacrificateur': 1,\n", " 'sanctissima': 1,\n", " 'satisfactory': 3,\n", " 'schoolbuildings': 29,\n", " 'schottische': 3,\n", " 'sentimentale': 1,\n", " 'septemtrionale': 1,\n", " 'septentrionale': 1,\n", " 'settlements': 16,\n", " 'shareholder': 1,\n", " 'shopkeepers': 1,\n", " 'siciliennes': 1,\n", " 'sioujssances': 2,\n", " 'slenderness': 1,\n", " 'spectabilis': 1,\n", " 'spirituelles': 1,\n", " 'stalagmites': 3,\n", " 'stockholders': 1,\n", " 'strengthened': 1,\n", " 'subdivision': 1,\n", " 'superintendence': 1,\n", " 'supervising': 2,\n", " 'supervisors': 1,\n", " 'supplementary': 1,\n", " 'surrounding': 5,\n", " 'sweethearts': 2,\n", " 'temperaments': 1,\n", " 'temptations': 1,\n", " 'territories': 3,\n", " 'thanksgiving': 1,\n", " 'theological': 1,\n", " 'topographical': 1,\n", " 'traditional': 1,\n", " 'transgressions': 2,\n", " 'transparencies': 31,\n", " 'transportation': 3,\n", " 'transported': 2,\n", " 'transporting': 4,\n", " 'treschrestien': 5,\n", " 'tresexcellente': 1,\n", " 'tresillustre': 1,\n", " 'trespuissant': 1,\n", " 'triangulation': 1,\n", " 'triumphante': 3,\n", " 'ultracentrifuge': 2,\n", " 'unclassified': 1,\n", " 'undentified': 3,\n", " 'underground': 1,\n", " 'undernourished': 1,\n", " 'unidentifed': 4,\n", " 'unidentifiable': 1,\n", " 'unidentified': 146,\n", " 'universities': 1,\n", " 'unprotected': 1,\n", " 'unrecognizable': 1,\n", " 'ventilating': 1,\n", " 'vingthuictieme': 1,\n", " 'violoncello': 30,\n", " 'violoncellos': 1,\n", " 'waterfronts': 2,\n", " 'watermelons': 7,\n", " 'weatherboarded': 1,\n", " 'whitewashing': 1}" ] }, 
"execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "longwords" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from nltk.collocations import *\n", "finder = BigramCollocationFinder.from_words(filtered['ia']['filtered'])\n", "bigram_measures = nltk.collocations.BigramAssocMeasures()\n", "scored = finder.score_ngrams(bigram_measures.raw_freq)\n", "sorted(bigram for bigram, score in scored) " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#This thing here just hangs forevs. I wonder if it's possible to do it without the notebook?\n", "from nltk.collocations import *\n", "bigram_measures = nltk.collocations.BigramAssocMeasures()\n", "trigram_measures = nltk.collocations.TrigramAssocMeasures()\n", "finder = BigramCollocationFinder.from_words(filteredgpo)\n", "finder.nbest(bigram_measures.pmi, 10)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "60168\n", "updating FD with biodiv\n", "131704\n", "updating FD with rumsey\n", "153719\n", "updating FD with commonwealth\n", "323186\n", "updating FD with georgia\n", "428524\n", "updating FD with harvard\n", "444294\n", "updating FD with ia\n", "853180\n", "updating FD with getty\n", "874037\n", "updating FD with kentucky\n", "879550\n", "updating FD with minnesota\n", "892810\n", "updating FD with missouri\n", "965647\n", "updating FD with mwdl\n", "1589542\n", "updating FD with nara\n", "2556336\n", "updating FD with nocar\n", "2689928\n", "updating FD with smiths\n", "2948721\n", "updating FD with socar\n", "2960280\n", "updating FD with texas\n", "3709827\n", "updating FD with gpo\n", "4056106\n", "updating FD with illinois\n", "4066429\n", "updating FD with usc\n", "4167642\n", "updating FD with virginia\n", "4193316\n", "updating FD with 
nocoll\n", "4193393\n" ] } ], "source": [ "import pickle\n", "import nltk\n", "\n", "colls = [\"biodiv\",\"rumsey\",\"commonwealth\",\"georgia\",\"harvard\",\n", " \"ia\",\"getty\",\"kentucky\",\"minnesota\",\"missouri\",\"mwdl\",\n", " \"nara\",\"nocar\",\"smiths\",\"socar\",\"texas\",\"gpo\",\"illinois\",\n", " \"usc\",\"virginia\",\"nocoll\"]\n", "\n", "#colls = [\"biodiv\"]\n", "\n", "FD_DIR = \"/media/storage/dpla-data/pickles/new/\"\n", "\n", "def load_fd(coll):\n", "    \"\"\"Return the unpickled contents of FD_DIR + coll + '_fd.p', closing the file afterwards.\"\"\"\n", "    with open(FD_DIR + coll + \"_fd.p\", \"rb\") as fh:\n", "        return pickle.load(fh)\n", "\n", "# fds keeps one distribution per collection; fd is the running total over all of them.\n", "fds = {}\n", "fds['artstor'] = load_fd(\"artstor\")\n", "# Merge into a copy: updating the artstor object itself would silently turn\n", "# fds['artstor'] into the combined total as well.\n", "# (Relies on the pickled object having .copy(), as Counter/FreqDist do.)\n", "fd = fds['artstor'].copy()\n", "print(len(fd))\n", "for coll in colls:\n", "    tmp = load_fd(coll)\n", "    print(\"updating FD with \" + coll)\n", "    fds[coll] = tmp\n", "    fd.update(tmp)\n", "    print(len(fd))\n" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[('library', 6117895),\n", " ('university', 4474393),\n", " ('digital', 4393278),\n", " ('county', 4101874),\n", " ('archives', 3912296),\n", " ('image', 3856146),\n", " ('http', 3622052),\n", " ('utah', 3508437),\n", " ('texas', 3088382),\n", " ('states', 3034554),\n", " ('united', 2980122),\n", " ('records', 2940508),\n", " ('collection', 2933041),\n", " ('edu', 2869869),\n", " ('text', 2743094),\n", " ('u', 2735080),\n", " ('1', 2651930),\n", " ('national', 2627569),\n", " ('use', 2369332),\n", " ('c', 2323676),\n", " ('state', 2222489),\n", " ('office', 2198024),\n", " ('libraries', 2112146),\n", " ('north', 2089362),\n", " ('carolina', 2021982),\n", " ('georgia', 2013630),\n", " ('ark', 1940101),\n", " ('california', 1810572),\n", " ('images', 1793602),\n", " ('unt', 1793154),\n", " ('history', 1785922),\n", " ('public', 1720702),\n", " ('department', 1687369),\n", " ('english', 1663863),\n", " ('southern', 1559202),\n", " ('death', 1533958),\n", " ('d', 1516996),\n", " ('copyright', 1463445),\n", " ('67531', 1454858),\n", " ('photographs', 1451551),\n", " ('texashistory', 
1432924),\n", " ('information', 1398449),\n", " ('washington', 1308460),\n", " ('x', 1220153),\n", " ('smithsonian', 1199436),\n", " ('available', 1197530),\n", " ('research', 1195215),\n", " ('center', 1163841),\n", " ('government', 1150791),\n", " ('institution', 1147249),\n", " ('collections', 1140774),\n", " ('content', 1116803),\n", " ('newspapers', 1106143),\n", " ('west', 1094535),\n", " ('ga', 1093339),\n", " ('ut', 1090170),\n", " ('administration', 1068841),\n", " ('college', 1065094),\n", " ('los', 1059295),\n", " ('angeles', 1047333),\n", " ('photograph', 1045279),\n", " ('ca', 1025436),\n", " ('local', 1007516),\n", " ('census', 1003194),\n", " ('terms', 996437),\n", " ('service', 988342),\n", " ('portal', 967273),\n", " ('american', 961283),\n", " ('mountain', 960732),\n", " ('may', 940055),\n", " ('made', 934848),\n", " ('statistics', 914170),\n", " ('special', 886509),\n", " ('defense', 868309),\n", " ('disk', 866632),\n", " ('ed', 862066),\n", " ('p', 856817),\n", " ('vital', 853095),\n", " ('agency', 852556),\n", " ('2', 845176),\n", " ('5', 841400),\n", " ('archive', 839173),\n", " ('gordon', 826933),\n", " ('usc', 782346),\n", " ('museum', 775978),\n", " ('see', 773608),\n", " ('historical', 772560),\n", " ('dept', 764440),\n", " ('j', 761928),\n", " ('n', 757319),\n", " ('magnetic', 747803),\n", " ('title', 747283),\n", " ('tex', 734878),\n", " ('america', 722533),\n", " ('photographic', 721011),\n", " ('part', 719264),\n", " ('including', 718627),\n", " ('1905', 715122),\n", " ('3', 713008),\n", " ('unrestricted', 706048),\n", " ('contact', 677717),\n", " ('w', 676065),\n", " ('md', 668947),\n", " ('study', 664102),\n", " ('park', 663796),\n", " ('nmnh', 649509),\n", " ('maps', 647804),\n", " ('b', 643911),\n", " ('cgi', 641422),\n", " ('10', 638652),\n", " ('south', 635438),\n", " ('secretary', 634762),\n", " ('1958', 631306),\n", " ('4', 625355),\n", " ('newspaper', 620590),\n", " ('please', 616244),\n", " ('map', 613643),\n", " ('01', 
606953),\n", " ('include', 605423),\n", " ('1939', 605294),\n", " ('permission', 602815),\n", " ('new', 602509),\n", " ('anthropology', 597459),\n", " ('project', 587240),\n", " ('private', 587021),\n", " ('created', 583921),\n", " ('org', 582788),\n", " ('2008', 579928),\n", " ('8', 579199),\n", " ('lib', 571698),\n", " ('certificates', 567808),\n", " ('rights', 566483),\n", " ('certificate', 560875),\n", " ('health', 560430),\n", " ('contents', 559924),\n", " ('e', 557717),\n", " ('heritage', 550587),\n", " ('cm', 549198),\n", " ('atlanta', 547042),\n", " ('teaching', 544659),\n", " ('botany', 542307),\n", " ('7', 541722),\n", " ('full', 537197),\n", " ('general', 523510),\n", " ('sound', 516121),\n", " ('holding', 515365),\n", " ('video', 507502),\n", " ('11', 504369),\n", " ('society', 501581),\n", " ('12', 498668),\n", " ('calhoun', 494892),\n", " ('gpo', 492130),\n", " ('2007', 489677),\n", " ('17', 484438),\n", " ('recordings', 484166),\n", " ('mission', 482853),\n", " ('publicly', 481311),\n", " ('partners', 480262),\n", " ('thumbnail', 477850),\n", " ('itemurl', 477639),\n", " ('thumbnailurl', 477639),\n", " ('advertising', 475246),\n", " ('art', 473332),\n", " ('identifier', 473008),\n", " ('pictures', 472344),\n", " ('domain', 471126),\n", " ('military', 469395),\n", " ('l', 467158),\n", " ('h', 466419),\n", " ('assigned', 465535),\n", " ('specol', 465295),\n", " ('still', 463899),\n", " ('material', 462158),\n", " ('memorial', 461098),\n", " ('city', 455185),\n", " ('us', 444041),\n", " ('boston', 440872),\n", " ('war', 438395),\n", " ('includes', 433659),\n", " ('resource', 432396),\n", " ('number', 432306),\n", " ('213', 415973),\n", " ('news', 411263),\n", " ('arizona', 409384),\n", " ('internet', 408349),\n", " ('21', 406888),\n", " ('law', 406216),\n", " ('m', 401417),\n", " ('de', 400958),\n", " ('bureau', 392174),\n", " ('print', 385810),\n", " ('john', 385392),\n", " ('laws', 385150),\n", " ('22', 381806),\n", " ('www', 379402),\n", " ('white', 
378232),\n", " ('business', 378114),\n", " ('90089', 377826),\n", " ('20', 376706),\n", " ('13', 376208),\n", " ('massachusetts', 375639),\n", " ('files', 374845),\n", " ('daily', 372969),\n", " ('09', 372792),\n", " ('january', 371526),\n", " ('1921', 368779),\n", " ('item', 368361),\n", " ('wpa', 368349),\n", " ('03', 366716),\n", " ('century', 365267),\n", " ('doheny', 364968),\n", " ('drive', 364385),\n", " ('0189', 363865),\n", " ('household', 363696),\n", " ('commercial', 363215),\n", " ('international', 361430),\n", " ('02', 361132),\n", " ('division', 359732),\n", " ('kentucky', 358883),\n", " ('15', 357764),\n", " ('school', 356600),\n", " ('inc', 355061),\n", " ('printing', 353855),\n", " ('gov', 352039),\n", " ('16', 348978),\n", " ('hard', 348575),\n", " ('lake', 344570),\n", " ('6', 343146),\n", " ('documents', 339507),\n", " ('06', 335715),\n", " ('07', 335134),\n", " ('14', 334743),\n", " ('file', 334434),\n", " ('page', 332226),\n", " ('05', 331733),\n", " ('9', 329820),\n", " ('places', 328713),\n", " ('04', 328619),\n", " ('institute', 328372),\n", " ('1982', 325214),\n", " ('1994', 325061),\n", " ('f', 324959),\n", " ('air', 324400),\n", " ('oclc', 322550),\n", " ('purposes', 322339),\n", " ('vol', 321707),\n", " ('protection', 321439),\n", " ('getty', 320777),\n", " ('bin', 320732),\n", " ('works', 320621),\n", " ('hutzel', 315352),\n", " ('died', 315245),\n", " ('house', 314953),\n", " ('forces', 314721),\n", " ('g', 314565),\n", " ('r', 314401),\n", " ('visual', 312926),\n", " ('18', 311659),\n", " ('black', 310081),\n", " ('24', 309774),\n", " ('agreement', 308813),\n", " ('23', 307921),\n", " ('one', 307763),\n", " ('2006', 304967),\n", " ('company', 303990),\n", " ('retained', 303438),\n", " ('services', 302549),\n", " ('education', 301797),\n", " ('intellectual', 301731),\n", " ('photo', 301328),\n", " ('hill', 301178),\n", " ('30', 300117),\n", " ('35', 299784),\n", " ('cont', 297790),\n", " ('reserve', 297783),\n", " ('2014', 
297528),\n", " ('cards', 295644),\n", " ('pages', 294533),\n", " ('st', 293662),\n", " ('exceptions', 293184),\n", " ('25', 292434),\n", " ('view', 290866),\n", " ('housing', 289670),\n", " ('finance', 287345),\n", " ('improve', 285861),\n", " ('combined', 283872),\n", " ('proof', 283816),\n", " ('access', 283798),\n", " ('san', 282709),\n", " ('district', 282093),\n", " ('development', 281951),\n", " ('08', 281868),\n", " ('fill', 280758),\n", " ('street', 280641),\n", " ('certification', 279874),\n", " ('1960', 279644),\n", " ('gaps', 278142),\n", " ('management', 277529),\n", " ('familysearch', 277402),\n", " ('keypath', 277397),\n", " ('indexesresults', 277397),\n", " ('runwhat', 277397),\n", " ('idxfiles', 277397),\n", " ('v', 277391),\n", " ('william', 275976),\n", " ('2011', 275061),\n", " ('economics', 271972),\n", " ('geography', 271143),\n", " ('online', 268800),\n", " ('forest', 268767),\n", " ('2013', 267752),\n", " ('materials', 266942),\n", " ('communications', 266869),\n", " ('book', 266353),\n", " ('negative', 264987),\n", " ('19', 264908),\n", " ('salt', 264653),\n", " ('plants', 264340),\n", " ('28', 263042),\n", " ('dpla', 262957),\n", " ('color', 262938),\n", " ('along', 262712),\n", " ('environmental', 259654),\n", " ('jones', 259337),\n", " ('1940', 258904),\n", " ('obituaries', 258478),\n", " ('2000', 256307),\n", " ('mm', 254211),\n", " ('type', 254072),\n", " ('virginia', 253372),\n", " ('italy', 252068),\n", " ('index', 251253),\n", " ('athens', 250838),\n", " ('program', 249468),\n", " ('document', 249230),\n", " ('must', 248981),\n", " ('2003', 246499),\n", " ('minnesota', 246439),\n", " ('abilene', 245618),\n", " ('26', 244854),\n", " ('29', 243883),\n", " ('register', 242169),\n", " ('young', 241482),\n", " ('early', 239921),\n", " ('fax', 239667),\n", " ('educational', 239320),\n", " ('chapel', 238765),\n", " ('co', 238304),\n", " ('publications', 238114),\n", " ('system', 236973),\n", " ('periodicals', 235930),\n", " ('plantae', 
235889),\n", " ('george', 234770),\n", " ('publication', 233713),\n", " ('association', 233414),\n", " ('2001', 232500),\n", " ('york', 232489),\n", " ('html', 230373),\n", " ('max', 230254),\n", " ('two', 229834),\n", " ('historic', 229645),\n", " ('phone', 228783),\n", " ('ocolc', 226412),\n", " ('world', 225206),\n", " ('board', 225179),\n", " ('david', 225159),\n", " ('zoology', 224905),\n", " ('0', 224735),\n", " ('digitized', 224072),\n", " ('27', 223035),\n", " ('leslie', 222591),\n", " ('prints', 222392),\n", " ('journalism', 221812),\n", " ('survey', 221666),\n", " ('events', 221545),\n", " ('required', 220720),\n", " ('published', 220199),\n", " ('pictorial', 219600),\n", " ('o', 219307),\n", " ('report', 218246),\n", " ('federal', 217026),\n", " ('officials', 216327),\n", " ('activities', 215985),\n", " ('site', 212735),\n", " ('lccn', 212473),\n", " ('people', 212459),\n", " ('2005', 212395),\n", " ('negatives', 212081),\n", " ('non', 211659),\n", " ('1900', 210990),\n", " ('life', 210606),\n", " ('1990', 210331),\n", " ('usa', 210289),\n", " ('multimedia', 209791),\n", " ('740', 209446),\n", " ('reserved', 209199),\n", " ('2010', 208823),\n", " ('microfilm', 208575),\n", " ('also', 207242),\n", " ('flowering', 206963),\n", " ('subject', 206775),\n", " ('1920', 206671),\n", " ('contentdm', 206566),\n", " ('dallas', 206113),\n", " ('ferns', 204716),\n", " ('areas', 203520),\n", " ('821', 203110),\n", " ('relating', 202918),\n", " ('2343', 201969),\n", " ('2366', 201736),\n", " ('1950', 200446),\n", " ('atlas', 199019),\n", " ('medical', 198449),\n", " ('31', 197786),\n", " ('cultural', 197434),\n", " ('section', 197143),\n", " ('without', 196840),\n", " ('surveys', 196775),\n", " ('portraits', 196242),\n", " ('louis', 196128),\n", " ('nc', 195737),\n", " ('series', 195587),\n", " ('ill', 195430),\n", " ('church', 195243),\n", " ('jan', 194458),\n", " ('architecture', 193697),\n", " ('paper', 192058),\n", " ('james', 191054),\n", " ('portrait', 
190915),\n", " ('net', 188361),\n", " ('physical', 187254),\n", " ('aug', 187237),\n", " ('first', 187147),\n", " ('river', 186862),\n", " ('code', 186740),\n", " ('agriculture', 186244),\n", " ('electronic', 186066),\n", " ('attorney', 185874),\n", " ('building', 184663),\n", " ('block', 184394),\n", " ('courtesy', 184024),\n", " ('commerce', 184016),\n", " ('1860', 182868),\n", " ('animalia', 182830),\n", " ('handle', 182051),\n", " ('resources', 181443),\n", " ('relation', 181137),\n", " ('protected', 180416),\n", " ('civil', 180409),\n", " ('design', 180214),\n", " ('women', 179205),\n", " ('dicotyledonae', 179177),\n", " ('nevada', 179151),\n", " ('interior', 179110),\n", " ('social', 178571),\n", " ('enumeration', 178186),\n", " ('willard', 176096),\n", " ('weekly', 175513),\n", " ('family', 175324),\n", " ('buildings', 175169),\n", " ('space', 175042),\n", " ('paul', 174826),\n", " ('water', 174802),\n", " ('box', 174733),\n", " ('2004', 174469),\n", " ('men', 174309),\n", " ('card', 173753),\n", " ('hdl', 173582),\n", " ('work', 172733),\n", " ('region', 172700),\n", " ('personal', 172563),\n", " ('accession', 171192),\n", " ('marriott', 171158),\n", " ('edward', 170423),\n", " ('photography', 169921),\n", " ('ethnology', 168295),\n", " ('86', 167746),\n", " ('help', 166729),\n", " ('1947', 166686),\n", " ('rumsey', 166593),\n", " ('provided', 166261),\n", " ('invertebrate', 165889),\n", " ('charles', 165838),\n", " ('10020', 165827),\n", " ('jun', 165621),\n", " ('descriptions', 165543),\n", " ('indians', 165270),\n", " ('regulations', 165133),\n", " ('oct', 164313),\n", " ('105', 163809),\n", " ('2002', 162990),\n", " ('fort', 162660),\n", " ('language', 162111),\n", " ('brigham', 161673),\n", " ('student', 161631),\n", " ('group', 161578),\n", " ('scholarship', 161302),\n", " ('missouri', 160794),\n", " ('repository', 160549),\n", " ('ink', 159649),\n", " ('granted', 159447),\n", " ('employee', 159238),\n", " ('1945', 158703),\n", " ('go', 157813),\n", " 
('right', 157809),\n", " ('photographer', 157783),\n", " ('press', 157356),\n", " ('1910', 156652),\n", " ('basel', 156514),\n", " ('written', 156014),\n", " ('governed', 155957),\n", " ('received', 155705),\n", " ('htm', 155475),\n", " ('pursuant', 154863),\n", " ('jul', 154733),\n", " ('volumes', 154694),\n", " ('clipping', 154343),\n", " ('left', 154335),\n", " ('unknown', 154220),\n", " ('thomas', 154133),\n", " ('field', 153549),\n", " ('2009', 153404),\n", " ('glass', 153129),\n", " ('sep', 153080),\n", " ('medieval', 152980),\n", " ('idaho', 152927),\n", " ('1930', 152870),\n", " ('risk', 152495),\n", " ('1911', 152192),\n", " ('arte', 151898),\n", " ('entomology', 151555),\n", " ('etc', 150509),\n", " ('2012', 150425),\n", " ('antonio', 150363),\n", " ('id', 149367),\n", " ('used', 149299),\n", " ('distribution', 149213),\n", " ('indian', 149011),\n", " ('public_domain_copyright_notice', 148603),\n", " ('pdf', 148440),\n", " ('dwellings', 148387),\n", " ('foto', 147761),\n", " ('minore', 147644),\n", " ('jpg', 146458),\n", " ('34', 146391),\n", " ('customs', 146179),\n", " ('freely', 146062),\n", " ('gallery', 145645),\n", " ('la', 145560),\n", " ('brothers', 144934),\n", " ('hosted', 144340),\n", " ('area', 143776),\n", " ('solely', 142953),\n", " ('front', 142895),\n", " ('students', 141998),\n", " ('papers', 141136),\n", " ('date', 140889),\n", " ('byu', 140345),\n", " ('displayed', 140175),\n", " ('mexico', 140084),\n", " ('cornell', 140057),\n", " ('committee', 140028),\n", " ('subsequent', 139334),\n", " ('german', 139105),\n", " ('1865', 138987),\n", " ('1980', 138962),\n", " ('property', 138813),\n", " ('biodiversity', 138775),\n", " ('bhl', 138362),\n", " ('four', 137459),\n", " ('118', 136952),\n", " ('natural', 136814),\n", " ('control', 136766),\n", " ('commonwealth', 136381),\n", " ('pencil', 136271),\n", " ('east', 136139),\n", " ('urban', 135941),\n", " ('henry', 135170),\n", " ('accordance', 134266),\n", " ('reproduction', 133619),\n", " 
('landscape', 133579),\n", " ('well', 133358),\n", " ('el', 132876),\n", " ('june', 132790),\n", " ('children', 132419),\n", " ('views', 132410),\n", " ('1861', 132273),\n", " ('lee', 131946),\n", " ('pacific', 131651),\n", " ('unit', 131097),\n", " ('mass', 130764),\n", " ('creative', 130420),\n", " ('known', 130333),\n", " ('collected', 129987),\n", " ('address', 129929),\n", " ('extensive', 129875),\n", " ('mar', 129691),\n", " ('born', 129618),\n", " ('record', 129343),\n", " ('held', 129204),\n", " ('operations', 128698),\n", " ('african', 128237),\n", " ('presented', 128205),\n", " ('searched', 127683),\n", " ('electronically', 127637),\n", " ('owning', 127478),\n", " ('kdl', 127243),\n", " ('kyvl', 127243),\n", " ('illinois', 126520),\n", " ('trust', 126373),\n", " ('apr', 125606),\n", " ('year', 125582),\n", " ('sheet', 125238),\n", " ('form', 125225),\n", " ('y', 125130),\n", " ('restrictions', 125105),\n", " ('brown', 125012),\n", " ('robert', 124580),\n", " ('1931', 124444),\n", " ('open', 124106),\n", " ('army', 124086),\n", " ('tewksbury', 123514),\n", " ('1961', 122806),\n", " ('inches', 122774),\n", " ('1944', 122669),\n", " ('1901', 122437),\n", " ('geological', 122048),\n", " ('americans', 121953),\n", " ('unc', 121856),\n", " ('silver', 121508),\n", " ('data', 121190),\n", " ('man', 121139),\n", " ('identification', 121112),\n", " ('greensboro', 121093),\n", " ('letter', 121034),\n", " ('original', 120867),\n", " ('congress', 120865),\n", " ('force', 120828),\n", " ('economic', 120746),\n", " ('web', 120539),\n", " ('33', 120428),\n", " ('1988', 120197),\n", " ('version', 120127),\n", " ('july', 120049),\n", " ('smith', 119910),\n", " ('wilson', 118842),\n", " ('jackson', 118782),\n", " ('postcards', 118525),\n", " ('feb', 118024),\n", " ('branch', 117825),\n", " ('yale', 117823),\n", " ('charleston', 117478),\n", " ('marine', 117433),\n", " ('march', 117428),\n", " ('japan', 117362),\n", " ('breckenridge', 117142),\n", " ('president', 
116931),\n", " ('journal', 116872),\n", " ('austin', 116517),\n", " ('1897', 116517),\n", " ('20th', 116413),\n", " ('geographic', 116068),\n", " ('high', 115940),\n", " ('underwood', 115422),\n", " ('1929', 115352),\n", " ('construction', 115301),\n", " ('affairs', 114797),\n", " ('dc', 114710),\n", " ('cdm', 114165),\n", " ('forms', 114104),\n", " ('houston', 114065),\n", " ('1968', 113695),\n", " ('photographers', 113481),\n", " ('galveston', 113434),\n", " ('reproduced', 113272),\n", " ('current', 112959),\n", " ('day', 112493),\n", " ('000', 112018),\n", " ('mrs', 111773),\n", " ('studio', 111396),\n", " ('1977', 111183),\n", " ('savannah', 110938),\n", " ('commission', 110714),\n", " ('columbia', 110364),\n", " ('topographic', 110106),\n", " ('arts', 110082),\n", " ('annual', 110025),\n", " ('near', 109753),\n", " ('1913', 109285),\n", " ('headquarters', 108867),\n", " ('1898', 108827),\n", " ('1870', 108769),\n", " ('scale', 108352),\n", " ('1884', 107901),\n", " ('director', 107781),\n", " ('hall', 107747),\n", " ('arthropoda', 107737),\n", " ('dec', 107661),\n", " ('1974', 107275),\n", " ('viewed', 106833),\n", " ('1970', 106778),\n", " ('impa', 106393),\n", " ('joseph', 105617),\n", " ('publishing', 105220),\n", " ('1960s', 105116),\n", " ('assignment', 105086),\n", " ('commons', 105012),\n", " ('campus', 104760),\n", " ('johnson', 104500),\n", " ('personnel', 104442),\n", " ('regents', 104304),\n", " ('gelatin', 104134),\n", " ('case', 103803),\n", " ('consortium', 103687),\n", " ('navy', 103680),\n", " ('cite', 103655),\n", " ('1880', 103470),\n", " ('paso', 103317),\n", " ('almshouse', 103020),\n", " ('montana', 102957),\n", " ('attribution', 102937),\n", " ('agricultural', 102773),\n", " ('senate', 102581),\n", " ('road', 102546),\n", " ('printed', 102276),\n", " ('french', 102025),\n", " ('cooper', 101934),\n", " ('1890', 101527),\n", " ('postcard', 101328),\n", " ('anderson', 101323),\n", " ('film', 101230),\n", " ('harold', 101015),\n", " 
('central', 100930),\n", " ('18th', 100896),\n", " ('screen', 100874),\n", " ('sculpture', 100776),\n", " ('station', 100669),\n", " ('1967', 100337),\n", " ('wpacards', 100170),\n", " ('m38843', 99924),\n", " ('home', 99757),\n", " ('community', 99321),\n", " ('unidentified', 99280),\n", " ('1976', 99236),\n", " ('nov', 99147),\n", " ('objects', 99025),\n", " ('legislature', 99007),\n", " ('beyond', 98849),\n", " ('des', 98797),\n", " ('april', 98551),\n", " ('nature', 98401),\n", " ('bombus', 98357),\n", " ('funds', 98269),\n", " ('orange', 98133),\n", " ('clark', 97991),\n", " ('showing', 97933),\n", " ('aircraft', 97735),\n", " ('col', 97424),\n", " ('fair', 97124),\n", " ('classified', 97042),\n", " ('september', 97042),\n", " ('london', 96900),\n", " ('1914', 96875),\n", " ('october', 96659),\n", " ('given', 96657),\n", " ('three', 96618),\n", " ('administrative', 96053),\n", " ('monument', 95935),\n", " ('et', 95815),\n", " ('taylor', 95739),\n", " ('supported', 95486),\n", " ('old', 95416),\n", " ('paris', 95191),\n", " ('technology', 94990),\n", " ('uses', 94581),\n", " ('back', 94566),\n", " ('valley', 94549),\n", " ('photos', 94251),\n", " ('post', 94063),\n", " ('1909', 93737),\n", " ('years', 93596),\n", " ('saint', 93247),\n", " ('1912', 93169),\n", " ('province', 93153),\n", " ('administrator', 93103),\n", " ('mary', 93021),\n", " ('western', 92921),\n", " ('32', 92583),\n", " ('documentation', 92467),\n", " ('div', 92395),\n", " ('mr', 91921),\n", " ('obituary', 91676),\n", " ('via', 91507),\n", " ('transportation', 91502),\n", " ('monuments', 91381),\n", " ('science', 91042),\n", " ('town', 90978),\n", " ('africa', 90791),\n", " ('mission21', 90786),\n", " ('complex', 90711),\n", " ('references', 90398),\n", " ('documenting', 89892),\n", " ('administered', 89517),\n", " ('industry', 89006),\n", " ('1904', 88889),\n", " ('1985', 88667),\n", " ('governor', 88513),\n", " ('shows', 88507),\n", " ('sa', 88390),\n", " ('1917', 88367),\n", " ('1938', 
88319),\n", " ('1934', 88316),\n", " ('temple', 88292),\n", " ('plant', 88071),\n", " ('homeplace', 87990),\n", " ('music', 87664),\n", " ('lsta', 87615),\n", " ('island', 87472),\n", " ('com', 87179),\n", " ('february', 86945),\n", " ('1895', 86620),\n", " ('woman', 86612),\n", " ('uintah', 86384),\n", " ('corps', 86376),\n", " ('1936', 86243),\n", " ('requests', 86169),\n", " ('php', 86149),\n", " ('herald', 86109),\n", " ('architectural', 86035),\n", " ('staff', 86024),\n", " ('1949', 85957),\n", " ('china', 85920),\n", " ('renaissance', 85865),\n", " ('1886', 85680),\n", " ('november', 85606),\n", " ('1918', 85453),\n", " ('fire', 85333),\n", " ('dr', 85104),\n", " ('august', 84869),\n", " ('native', 84753),\n", " ('arthur', 84674),\n", " ('legal', 84631),\n", " ('medicine', 84585),\n", " ('country', 84526),\n", " ('1975', 84336),\n", " ('asia', 84283),\n", " ('1935', 84264),\n", " ('land', 84007),\n", " ('sc', 83995),\n", " ('schools', 83983),\n", " ('1907', 83978),\n", " ('1951', 83959),\n", " ('lowell', 83688),\n", " ('assistant', 83635),\n", " ('42', 83578),\n", " ('intake', 83383),\n", " ('1956', 83372),\n", " ('specimen', 83361),\n", " ('time', 83317),\n", " ('description', 83261),\n", " ('small', 83247),\n", " ('1989', 83077),\n", " ('result', 83032),\n", " ('notes', 82788),\n", " ('reno', 82539),\n", " ('hewitt', 82525),\n", " ('va', 82463),\n", " ('tichnor', 82428),\n", " ('photographed', 82332),\n", " ('user', 82259),\n", " ('1942', 82008),\n", " ('obtained', 81803),\n", " ('list', 81773),\n", " ('baroque', 81668),\n", " ('side', 81633),\n", " ('appalachian', 81544),\n", " ('ii', 81526),\n", " ('organized', 81510),\n", " ('1933', 81503),\n", " ('programs', 81467),\n", " ('licenses', 81248),\n", " ('shipler', 81143),\n", " ('ocean', 80955),\n", " ('restricted', 80953),\n", " ('1987', 80923),\n", " ('frank', 80794),\n", " ('hospital', 80752),\n", " ('bibliographical', 80726),\n", " ('sites', 80710),\n", " ('1941', 80699),\n", " ('creativecommons', 
80600),\n", " ('k', 80502),\n", " ('confederate', 80171),\n", " ('1915', 80024),\n", " ('power', 79771),\n", " ('walter', 79710),\n", " ('december', 79684),\n", " ('1850', 79519),\n", " ('individuals', 79174),\n", " ('prior', 79050),\n", " ('1943', 79023),\n", " ('eight', 79006),\n", " ('red', 78747),\n", " ('based', 78642),\n", " ('1908', 78581),\n", " ('gift', 78580),\n", " ('class', 78575),\n", " ('gothic', 78574),\n", " ('archaeology', 78568),\n", " ('richard', 78498),\n", " ('issued', 78486),\n", " ('session', 78459),\n", " ('members', 78346),\n", " ('romanesque', 78254),\n", " ('geologic', 78225),\n", " ('referred', 78141),\n", " ('nail', 78137),\n", " ('copy', 78007),\n", " ('examiner', 77946),\n", " ('1919', 77801),\n", " ('sciences', 77626),\n", " ('1972', 77583),\n", " ('1906', 77495),\n", " ('labor', 77399),\n", " ('large', 77087),\n", " ('gri', 77016),\n", " ('1962', 76933),\n", " ('politics', 76858),\n", " ('express', 76795),\n", " ('jr', 76613),\n", " ('security', 76405),\n", " ('scholar', 76358),\n", " ('base', 76119),\n", " ('standing', 76087),\n", " ('worth', 75775),\n", " ('opinions', 75772),\n", " ('islands', 75771),\n", " ('1903', 75761),\n", " ('union', 75761),\n", " ('nasa', 75756),\n", " ('1923', 75604),\n", " ('great', 75569),\n", " ('free', 75110),\n", " ('brenham', 75033),\n", " ('40', 74960),\n", " ('canyon', 74877),\n", " ('anthropological', 74734),\n", " ('1899', 74646),\n", " ('creek', 74628),\n", " ('romans', 74564),\n", " ('1928', 74549),\n", " ('otherwise', 74448),\n", " ('france', 74395),\n", " ('thorough', 74298),\n", " ('background', 74195),\n", " ('forests', 74118),\n", " ('harvard', 74020),\n", " ('1969', 73985),\n", " ('artstor', 73982),\n", " ('etruscans', 73875),\n", " ('1922', 73858),\n", " ('aerial', 73811),\n", " ('1916', 73791),\n", " ('support', 73740),\n", " ('1937', 73734),\n", " ('cat340573', 73646),\n", " ('place', 73577),\n", " ('1924', 73514),\n", " ('fulton', 73489),\n", " ('insects', 73347),\n", " ('1946', 
73079),\n", " ('manuscript', 72832),\n", " ('uncg', 72805),\n", " ('training', 72779),\n", " ('mail', 72687),\n", " ('1963', 72645),\n", " ('thursday', 72533),\n", " ('second', 72314),\n", " ('1955', 72212),\n", " ('bastrop', 71902),\n", " ('avenue', 71864),\n", " ('1997', 71741),\n", " ('ariz', 71521),\n", " ('azlibrary', 71327),\n", " ('1965', 71258),\n", " ('application', 71055),\n", " ('azmemory', 70966),\n", " ('info', 70901),\n", " ('correspondence', 70645),\n", " ('1952', 70592),\n", " ('order', 70491),\n", " ('practices', 70254),\n", " ('1954', 70162),\n", " ('1953', 70149),\n", " ('palmer', 69983),\n", " ('uss', 69834),\n", " ('1883', 69703),\n", " ('format', 69689),\n", " ('many', 69679),\n", " ('harris', 69591),\n", " ...]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fd.most_common()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "2773182" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(fd.hapaxes())" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "['contiiinecl',\n", " 'greise',\n", " 'genealogicalreco1888spof',\n", " 'noveluber',\n", " 'afgestooken',\n", " 'ocm15045154',\n", " '6455965',\n", " 'idx208420048060',\n", " '6406649',\n", " 'dnsc8706589',\n", " 'dnsc8706587',\n", " 'dnsc8706586',\n", " 'dnsc8706585',\n", " 'firstprincipleso00parl_0',\n", " 'firstprincipleso00parl_1',\n", " '2442383',\n", " '1318714',\n", " '04921800',\n", " 'wintkagasspan',\n", " 'choegun',\n", " 'd9e953e37b27cb8029cb2e5ca4ee690f',\n", " '6406647',\n", " '346373',\n", " 'ocm57308713',\n", " '4a329a29362c8693d61df65f222b60b4',\n", " '346376',\n", " 'dfst8304359',\n", " 'dnsc9001680',\n", " '04404800',\n", " 'dfst9106751',\n", " '652700225',\n", " 'meeler',\n", " '225251',\n", " 'underbank',\n", " 'ocm49779203',\n", " 
'ufers',\n", " 'riilcil',\n", " 'qurxtity',\n", " '284358',\n", " '284359',\n", " '45882508',\n", " 'ficantly',\n", " '284352',\n", " '284353',\n", " '284351',\n", " '284356',\n", " '284357',\n", " '284355',\n", " '51758736',\n", " 'dnsc9001682',\n", " '06878700',\n", " 'c944f95bb4c850d0326c9bf7b6e35839',\n", " '412_dsp_waste2energy_0051',\n", " '469492',\n", " 'racticul',\n", " 'ahc071015002a',\n", " '2155521',\n", " 'wageindex',\n", " '7afeffc8f2e92619106',\n", " '2155522',\n", " '6698004',\n", " 'diarioenquesepro00cigo',\n", " '6698005',\n", " '0160734517',\n", " 'bankroll',\n", " 'ocm41962946',\n", " '236485020',\n", " '6698000',\n", " '870466468',\n", " '6698001',\n", " 'dfsd0202493',\n", " 'ahc1702149001',\n", " 'dfsd0202490',\n", " 'fiy',\n", " 'peble',\n", " 'colvtalner',\n", " '39999064270257',\n", " '428077796',\n", " 'dnsc8706588',\n", " 'veterinaryhomopa00hurn',\n", " '6698009',\n", " 'idx208420020051',\n", " 'idx208420020050',\n", " 'idx208420020053',\n", " 'idx208420020052',\n", " 'idx208420020055',\n", " 'idx208420020054',\n", " 'idx208420020057',\n", " 'idx208420020056',\n", " 'idx208420020059',\n", " 'idx208420020058',\n", " 'manipulaton',\n", " 'todod',\n", " '40_cfd_os_2004_1201_141_404',\n", " '40_cfd_os_2004_1201_141_405',\n", " '40_cfd_os_2004_1201_141_406',\n", " '40_cfd_os_2004_1201_141_400',\n", " '40_cfd_os_2004_1201_141_401',\n", " '40_cfd_os_2004_1201_141_402',\n", " '40_cfd_os_2004_1201_141_403',\n", " 'dnsn9010679',\n", " 'dnsn9010678',\n", " '40_cfd_os_2004_1201_141_408',\n", " '40_cfd_os_2004_1201_141_409',\n", " '6601455',\n", " 'highwagejobsinco00unit',\n", " 'fi4',\n", " '6601453',\n", " '6601452',\n", " 'whichemploy',\n", " 'fi9',\n", " '6601459',\n", " '501938444',\n", " '123955633',\n", " '43960370',\n", " 'onelincolnstreet00hmma',\n", " '723375',\n", " 'néill',\n", " 'phalluses',\n", " 'bustum',\n", " '237047684',\n", " '3a8f16a675b09218885',\n", " 'dfsd0405578',\n", " 'idx208420269484',\n", " '538677',\n", " '538675',\n", " 
'538674',\n", " '538673',\n", " '538672',\n", " '538671',\n", " '538670',\n", " 'embacy',\n", " 'llte1111',\n", " '538679',\n", " '538678',\n", " '6443891',\n", " '6443890',\n", " '6443893',\n", " '6443892',\n", " '6443895',\n", " '6443894',\n", " '6443897',\n", " '6443896',\n", " 'embach',\n", " '6443898',\n", " 'podali',\n", " 'dfst8600002',\n", " 'dfst8600003',\n", " 'footwashing',\n", " 'dfst8600001',\n", " 'dfst8600004',\n", " 'dnsc8703203',\n", " 'dnsc8703202',\n", " '0803728565',\n", " 'dnsc8703200',\n", " 'dnsc8703207',\n", " 'dnsc8703206',\n", " 'dnsc8703205',\n", " 'dnsc8703204',\n", " 'lettertodearaunt00west146',\n", " 'dnsc8703208',\n", " '5486051',\n", " 'idx208420165554',\n", " 'idx208420165555',\n", " 'idx208420165556',\n", " '67820610r',\n", " 'idx208420165550',\n", " 'idx208420165551',\n", " 'idx208420165552',\n", " 'idx208420165553',\n", " 'idx208420165558',\n", " 'idx208420165559',\n", " '39999066745066',\n", " '57368602',\n", " '0160708788',\n", " 'utarid',\n", " '728242073',\n", " '550828',\n", " 'putorious',\n", " 'utarin',\n", " 'hr096p2',\n", " 'berlingebrüder',\n", " 'ocm58431435',\n", " 'scrviot',\n", " 'dc10blk_c48061_o01',\n", " 'archonnov1923111dumm',\n", " 'agr65000117',\n", " '611902418',\n", " 'ranchlands',\n", " 'dfsd0505601',\n", " 'mw0398',\n", " 'mw0394',\n", " 'mw0395',\n", " 'mw0396',\n", " 'mw0397',\n", " 'mw0390',\n", " 'mw0392',\n", " 'mw0393',\n", " 'priiliitive',\n", " 'organisada',\n", " 'idx208420040645',\n", " 'chattenango',\n", " 'ocm57199627',\n", " 'annualreportnati19932nati',\n", " '755_131_015_01',\n", " '755_131_015_02',\n", " 'tobacco_nrp23e00',\n", " 'dayrl',\n", " '45199902',\n", " 'idx208420226025',\n", " 'lixuna',\n", " 'fineart',\n", " 'programlist00mass',\n", " 'b10828126',\n", " '03ad2af9e5e8a56ca6f69b3cc435e914',\n", " 'enfóquese',\n", " 'nomenclatoriszoo00agas',\n", " '766588',\n", " 'sermones04wycl',\n", " '7262462',\n", " '7262463',\n", " '7262460',\n", " '7262461',\n", " '7262466',\n", " 
'7262467',\n", " '7262464',\n", " '7262465',\n", " '7262468',\n", " '7262469',\n", " 'nxtilnd',\n", " 'ocn227183726',\n", " 'congenite',\n", " 'idx208420148768',\n", " 'idx208420148769',\n", " 'idx208420148766',\n", " 'idx208420148767',\n", " 'idx208420148764',\n", " 'idx208420148765',\n", " 'idx208420148762',\n", " 'idx208420148763',\n", " 'idx208420148760',\n", " 'idx208420148761',\n", " '826128635',\n", " 'claymoore',\n", " 'alleviations',\n", " 'contributiontohi01ridl',\n", " '12071628',\n", " 'magazijnvantuins15laar',\n", " 'investigationofcnyc0708unit',\n", " '454th',\n", " 'dnsc9108094',\n", " 'underband',\n", " 'achc172',\n", " 'compléte',\n", " 'annualreporttown1962huds',\n", " '0062_0053',\n", " '0062_0052',\n", " '0062_0051',\n", " '0062_0050',\n", " '0062_0057',\n", " '0062_0056',\n", " '0062_0055',\n", " '0062_0054',\n", " '0062_0059',\n", " '082833',\n", " '082836',\n", " 'in8st',\n", " '803985224',\n", " 'replytoharnackon00cremuoft',\n", " '200640',\n", " '200641',\n", " '200643',\n", " '200646',\n", " '200647',\n", " 'annualreport1908mass',\n", " 'ntdrmiyp',\n", " '39350564',\n", " '335648',\n", " 'bb3be5146b92278a48870308d6d82935',\n", " '885665870',\n", " 'r8026',\n", " 'archivfrnaturg7301berl',\n", " '57449076',\n", " 'dnsc9108090',\n", " 'ocm46705900',\n", " 'p0031_23621_0001',\n", " '6400344',\n", " '6400345',\n", " 'annualreportsoft1996stod',\n", " '6400347',\n", " '6400340',\n", " 'snrrey',\n", " '6400342',\n", " 'perlavenutainrom00pizz',\n", " '6400348',\n", " '6400349',\n", " 'bostoncollegemagsp1994bos',\n", " '676967',\n", " 'ahc099276002a',\n", " 'ocm50762106',\n", " 'ocm32263636',\n", " 'brucharzt',\n", " '19348145014000000000',\n", " '978984',\n", " '978987',\n", " '978982',\n", " '003753477',\n", " 'twk27811',\n", " 'vnrl',\n", " 'cbc48325_h02',\n", " 'vnrk',\n", " 'dfst9100038',\n", " 'gemonteerd',\n", " 'usparticipationi1993unit',\n", " 'ds01017',\n", " 'photomisc061',\n", " '000902464',\n", " '52913537',\n", " 'mueewscontains',\n", 
" '6473418',\n", " '6473419',\n", " '6473412',\n", " '6473413',\n", " '6473410',\n", " '6473411',\n", " '6473416',\n", " '6473417',\n", " '6473414',\n", " '6473415',\n", " '5865218',\n", " '5865219',\n", " '498_001',\n", " '34623874',\n", " '5865210',\n", " '5865211',\n", " '5865212',\n", " '5865213',\n", " '5865214',\n", " '5865215',\n", " '5865216',\n", " '438065950',\n", " '759866782',\n", " '427921282',\n", " '003_may',\n", " 'büderverzeichnis',\n", " '06_10_017847',\n", " '06_10_017846',\n", " '06_10_017845',\n", " '06_10_017844',\n", " '06_10_017843',\n", " '06_10_017842',\n", " '06_10_017841',\n", " '06_10_017840',\n", " 'rainmaking',\n", " '06_10_017849',\n", " '08_06_014939',\n", " '08_06_014938',\n", " '08_06_014937',\n", " '08_06_014936',\n", " '08_06_014935',\n", " '08_06_014934',\n", " '08_06_014933',\n", " '08_06_014932',\n", " '08_06_014931',\n", " '08_06_014930',\n", " 'ocm26865627',\n", " '00828702',\n", " '00828701',\n", " 'n24⁰',\n", " 'nouvellebiograph41hoef',\n", " 'alrso',\n", " 'annualreportofto2008unse',\n", " 'flnctnating',\n", " 'muskelbewegung',\n", " '14759262',\n", " 'ocm14847373',\n", " 'ecologicos',\n", " 'vegas_055',\n", " 'nfant',\n", " 'fws0bsusfi83140201usfi',\n", " 'eann',\n", " 'idx208420105650',\n", " 'idx208420105653',\n", " 'idx208420105652',\n", " 'idx208420105655',\n", " 'idx208420105657',\n", " 'idx208420105656',\n", " 'idx208420105659',\n", " 'idx208420105658',\n", " 'eana',\n", " '123915919',\n", " 'herthel',\n", " '20canyon',\n", " 'philipshandyatla00bart',\n", " 'pl10blk_c48185_000',\n", " '6890229',\n", " 'issliiiiiipo',\n", " 'tendenzdrama',\n", " 'translationsrepr00univ',\n", " 'swalllpy',\n", " '292504',\n", " 'm2g46',\n", " '292506',\n", " '292507',\n", " '93590',\n", " 'tchats',\n", " 'idx208420043302',\n", " 'idx208420043303',\n", " 'idx208420043300',\n", " 'idx208420043301',\n", " 'idx208420043306',\n", " 'idx208420043307',\n", " 'idx208420043304',\n", " 'idx208420043305',\n", " 'idx208420043308',\n", " 
'idx208420043309',\n", " 'physeodesmos',\n", " 'ptbw_149',\n", " 'ptbw_148',\n", " '6514768',\n", " '6514769',\n", " 'ef9ce3ebeb45153cb2f8f310afa603e4',\n", " '6514764',\n", " '6514765',\n", " '6514766',\n", " '6514767',\n", " '6514760',\n", " '6514762',\n", " '6514763',\n", " '6371356',\n", " '6371357',\n", " '6371354',\n", " '6371355',\n", " '6371352',\n", " '6371353',\n", " '6371350',\n", " '6371351',\n", " 'ailsofi',\n", " 'prsity',\n", " '6371358',\n", " '6371359',\n", " 't35b',\n", " 't35c',\n", " '794278362',\n", " 'desso',\n", " '800957',\n", " 'capecodnationals222unit',\n", " 'boyd1905',\n", " '706503356',\n", " 'p0001_1136_18648_verso',\n", " '2007_8_1163',\n", " '234072517',\n", " '234072515',\n", " '39999065430165',\n", " 'pl10blk_c48185_004',\n", " 'historyofmoderns02ferg',\n", " 'dm0308',\n", " 'dm0309',\n", " 'dm0306',\n", " 'dm0305',\n", " 'dm0302',\n", " 'dm0303',\n", " 'dm0300',\n", " 'dm0301',\n", " 'reportofboardofm00mass_25',\n", " 'inoiiejr',\n", " 'reportofboardofm00mass_24',\n", " '180764899',\n", " '19348025650100000000',\n", " 'pedicellate',\n", " '098889',\n", " '851083009',\n", " '66_2_1_001',\n", " 'dasc8610641',\n", " 'dasc8610640',\n", " 'reportofboardofm00mass_23',\n", " 'кампании',\n", " 'ocm54515622',\n", " 'reportofboardofm00mass_22',\n", " '006275927',\n", " '46670027',\n", " '8c5cf2d331b244e18b0fd1adb787e260',\n", " 'dmsd0742448',\n", " '81983ac',\n", " 'samaritanchronic00josh',\n", " 'laukaan',\n", " 'universityofnort326univ',\n", " '42448916456601000000',\n", " 'totoramba',\n", " 'condev2848_overview',\n", " '522141',\n", " 'americanmedicalt2186unse',\n", " 'investigationofconc12unit',\n", " 'rightbrained',\n", " 'koekkoek',\n", " 'meramecgreenway00nati',\n", " '5228565',\n", " 'immmlmm',\n", " 'chonsia',\n", " 'rightlateralized',\n", " 'dfst8411099',\n", " 'dfst8411098',\n", " 'agr64000414',\n", " '19348209700200000000',\n", " 'nerinea',\n", " '800022',\n", " '81640821',\n", " 'idx208420177306',\n", " 'idx208420177307',\n", " 
'idx208420177304',\n", " 'idx208420177305',\n", " 'idx208420177302',\n", " 'idx208420177303',\n", " 'idx208420177300',\n", " 'idx208420177301',\n", " 'archivfrmikros911berl',\n", " 'ocm44621113',\n", " 'iilacle',\n", " 'idx208420177309',\n", " 'marbledecoration00blag',\n", " '60694786',\n", " '318910995',\n", " '800023',\n", " 'ms_1604_mitigationhouse_2415',\n", " 'health_emphasis_program',\n", " 'idx208420258488',\n", " 'idx208420272199',\n", " 'idx208420272198',\n", " '800020',\n", " 'idx208420272195',\n", " 'idx208420272194',\n", " 'idx208420272197',\n", " 'idx208420272196',\n", " 'idx208420272191',\n", " 'idx208420272190',\n", " 'idx208420272193',\n", " 'idx208420272192',\n", " '00002466_tn_0001',\n", " 'newsbooks',\n", " '69952104',\n", " 'idx208420241588',\n", " 'idx208420241589',\n", " 'z203',\n", " 'idx208420241584',\n", " 'idx208420241586',\n", " 'idx208420241587',\n", " 'idx208420241580',\n", " 'idx208420241581',\n", " 'idx208420241582',\n", " 'idx208420241583',\n", " 'lltry',\n", " '6682510',\n", " '6682513',\n", " '6682515',\n", " '6682514',\n", " '6682517',\n", " 'acidlty',\n", " '6682519',\n", " '6682518',\n", " 'ahc167002032a',\n", " 'machery',\n", " 'researchpaper22nort',\n", " 'bibliapauperumco00unwi',\n", " '05000us48177m',\n", " 'soang',\n", " '206543',\n", " 'lettertodeardebo00chap',\n", " 'catanaei',\n", " '167656',\n", " '39999065612291',\n", " 'dnsn9804200',\n", " 'dfsd0202499',\n", " '06159200',\n", " 'dfsd0202498',\n", " '54490606',\n", " '31232081',\n", " 'universityofnort224univ',\n", " '800024',\n", " '35581049',\n", " '40152221',\n", " '00007852_tn_0001',\n", " 'studentsguidetod00nettuoft',\n", " 'fih',\n", " 'dfsd0202492',\n", " 'dentair',\n", " 'fik',\n", " '38596296',\n", " '97643431',\n", " '00002319_tn_0001',\n", " 'dfsd0202497',\n", " '19571027',\n", " '39999057055798',\n", " 'dfsd0202496',\n", " 'researchpapers81inte',\n", " 'cereti',\n", " 'kalokoe',\n", " 'dfsd0202495',\n", " '261710',\n", " 'dscn1922',\n", " 'cerete',\n", " 
'05067100',\n", " '13729521',\n", " 'censusofbusiness1952unit',\n", " 'cbs4893370_001',\n", " 'vorcl',\n", " 'instiu',\n", " 'evaluaciónes',\n", " 'ohtaiu',\n", " '65820290r',\n", " '19348199005000000000',\n", " '1049118',\n", " '48999328',\n", " '006355217',\n", " 'grifflin',\n", " 'ocm47248522',\n", " 'exobiologyineart00ames',\n", " 'armymedicalmuseumcollectioncatalogueofpathologicaldrawingsofmedical',\n", " '225867405',\n", " '6003189',\n", " '6003188',\n", " '6003187',\n", " '6003186',\n", " '39999063171092',\n", " '6003184',\n", " '6003183',\n", " '6003182',\n", " '6003181',\n", " '6003180',\n", " 'dnsn8604075',\n", " 'dnsn8604074',\n", " 'reportofdanishbi10dans',\n", " 'dnsn8604076',\n", " 'dnsn8604071',\n", " 'dnsn8604070',\n", " 'dnst8208141',\n", " 'athénée',\n", " 'dnst8208144',\n", " 'ptbw_209',\n", " '147279',\n", " 'shadscaje',\n", " 'abrahamvs',\n", " '6361628',\n", " 'marign',\n", " '6460578',\n", " '147271',\n", " '147270',\n", " '6361621',\n", " '147272',\n", " '6361627',\n", " '6361626',\n", " '6361625',\n", " '6361624',\n", " 'illspi',\n", " 'oceanographicobs1971moyn',\n", " 'ptbw_206',\n", " 'b20386424',\n", " 'idx208420027629',\n", " 'idx208420027628',\n", " 'idx208420027626',\n", " 'idx208420027625',\n", " 'idx208420027624',\n", " 'idx208420027623',\n", " 'idx208420027622',\n", " 'idx208420027621',\n", " 'ncreased',\n", " 'poorpotterofyork01bark',\n", " '6460570',\n", " 'lq9ic',\n", " '2011506740',\n", " '2011506741',\n", " '2011506746',\n", " '6460573',\n", " 'lsouth',\n", " 'charruaud',\n", " '729885960',\n", " 'moanavilla',\n", " '755_029_012_01',\n", " 'sumner138_2_1_120c1',\n", " 'suiuciuc',\n", " 'idx208420243402',\n", " '6460576',\n", " 'fillallcial',\n", " 'lytleton',\n", " 'portrteeiniger00riga',\n", " 'idx208420207973',\n", " '002590',\n", " 'catalogueofstate1863stat',\n", " '277_neuve214999',\n", " 'idx208420017259',\n", " 'idx208420017258',\n", " '6401770',\n", " '6401776',\n", " '6401777',\n", " '6401774',\n", " 'dnsn9010670',\n", 
" 'idx208420017251',\n", " 'idx208420017250',\n", " 'idx208420017252',\n", " 'idx208420017255',\n", " 'idx208420017254',\n", " 'idx208420017257',\n", " 'idx208420017256',\n", " '40_cfd_os_2004_1201_141_407',\n", " 'dnsn9010675',\n", " 'preinstallation',\n", " '002597',\n", " 'twk61259',\n", " 'twk61254',\n", " 'twk61255',\n", " 'twk61256',\n", " 'twk61257',\n", " 'twk61250',\n", " 'twk61251',\n", " 'twk61252',\n", " 'twk61253',\n", " 'americanfarmer2425balt',\n", " 'pendleys',\n", " 'coustet',\n", " '009917303',\n", " 'blapey',\n", " '754123',\n", " 'oubrerie',\n", " 'fs200112',\n", " '36595169',\n", " 'fs200117',\n", " '6601454',\n", " 'fs200119',\n", " 'nazvy',\n", " '6601457',\n", " '810216859',\n", " 'ocm32849495',\n", " '6601456',\n", " '6601451',\n", " '943296',\n", " '6601450',\n", " '94025092',\n", " 'idx208420243404',\n", " 'histoirenaturell00less',\n", " 'buonconsiglio',\n", " '7f97e8ebc56f3b036fbaba41f511ef14',\n", " '50141339',\n", " '14200136',\n", " '42148375002000000000',\n", " 'laueblose',\n", " '3f⁸',\n", " '6601458',\n", " '5623098',\n", " '5623099',\n", " '5623092',\n", " '5623093',\n", " '5623090',\n", " '5623091',\n", " '5623096',\n", " 'i1r11',\n", " '5623094',\n", " '5623095',\n", " 'eiiiaient',\n", " 'paternitiy',\n", " '49214378',\n", " '06801433',\n", " 'annualreportcolu1944colu',\n", " 'idx208420243409',\n", " '1039100',\n", " 'pocketalmanackfo1807amer',\n", " 'guilfordcollegi519021903',\n", " '001326612',\n", " '855883',\n", " 'lehrlings',\n", " 'internationalesa04inte',\n", " 'dnsc9401209',\n", " '41382262',\n", " '04_1of2_dsc_2693',\n", " '04372800',\n", " 'doi_2770',\n", " '06538300',\n", " '62_5_9_001',\n", " '173219654',\n", " 'uspharmacopoeiai00slsnuoft',\n", " '42448918750060000000',\n", " 'roffler',\n", " 'hydrabiologie',\n", " '05236200',\n", " 'nouvellehyginede00tave',\n", " 'yonaknoka',\n", " '708252503',\n", " 'ocm50037038',\n", " '0160754798',\n", " 'stakelin',\n", " '5fgeneral',\n", " '217821',\n", " '217822',\n", " 
'217823',\n", " '217824',\n", " '217825',\n", " '315850782',\n", " 'zeitschriftfrele16elek',\n", " 'umn23361b',\n", " 'cyclopaediaofpra11ziemuoft',\n", " '201416',\n", " '38994315',\n", " 'isetan',\n", " 'hotisse',\n", " 'cinique',\n", " '828288556',\n", " 'bogasse',\n", " '73272738',\n", " '39999065840397',\n", " 'kisima',\n", " 'whitemartinsgeni00whituoft',\n", " '719450300',\n", " '320456955',\n", " '5823242',\n", " '10760193',\n", " '9780160683268',\n", " 't45nr23e',\n", " '6663688',\n", " '6663689',\n", " '6663684',\n", " '6663685',\n", " '6663686',\n", " '6663687',\n", " '6663680',\n", " '6663681',\n", " '6663682',\n", " '6663683',\n", " '5613176',\n", " 'dnsd0311938',\n", " '753739604',\n", " 'bulletindelasoci311892soci',\n", " '449254',\n", " '2568052r',\n", " 'idx208420073510',\n", " 'dnst9101238',\n", " 'idx208420073511',\n", " 'dnst9101239',\n", " 'idx208420073512',\n", " 'dnst9101236',\n", " 'idx208420073513',\n", " '5613170',\n", " 'dnsd0311932',\n", " '6428023',\n", " 'dnst9101234',\n", " 'condev7577e',\n", " 'dnst9101235',\n", " 'idx208420073516',\n", " 'catalogueofficiel00expo',\n", " 'dnsd0311937',\n", " 'idx208420073517',\n", " 'annualreportofto1993unse',\n", " 'dnsd0311936',\n", " 'hightley',\n", " 'dnst9101230',\n", " '03229a',\n", " 'dnst9101231',\n", " '51277834',\n", " '6412729',\n", " 'ʹemile',\n", " 'staqe',\n", " 'wadowice',\n", " 'motilitydisturbance',\n", " '279240',\n", " '279242',\n", " '279244',\n", " '279247',\n", " 'idx208420045494',\n", " 'idx208420045495',\n", " 'idx208420045496',\n", " 'idx208420045497',\n", " 'idx208420045490',\n", " 'idx208420045491',\n", " 'idx208420045492',\n", " 'idx208420045493',\n", " 'cnnnor',\n", " 'idx208420045498',\n", " 'idx208420045499',\n", " 'iilpgs',\n", " 'va2259',\n", " 'gainestsown',\n", " '695055989',\n", " '464629053',\n", " '38136000196400',\n", " '85833201',\n", " '6472240',\n", " '6472241',\n", " '6472242',\n", " '6472243',\n", " '6472244',\n", " '6472245',\n", " '6472246',\n", " 
'6472247',\n", " '6472248',\n", " '6472249',\n", " '6600241',\n", " 'dx18780838',\n", " '00006612',\n", " '2412962',\n", " '00006615',\n", " '098231',\n", " 'nordhang',\n", " '713205230',\n", " '6600246',\n", " '01758900',\n", " '779978829',\n", " 'overcored',\n", " '00006616',\n", " 'paperspecs',\n", " '244008156',\n", " 'photosynthesis00spoe',\n", " 'dfsd0509278',\n", " '2065802',\n", " '7308832',\n", " 'lnrvnl',\n", " '1891mc',\n", " '88699',\n", " 'b1162353',\n", " 'lienor',\n", " 'annualreportfort1941bedf',\n", " 'eppsit',\n", " '52472843',\n", " '6523569',\n", " '85119',\n", " '8869c',\n", " '85116',\n", " '85117',\n", " '85114',\n", " '85115',\n", " 'd9f2d0667a199d15e0cc709874d89bb6',\n", " '765950',\n", " '2261177',\n", " 'lnnlity',\n", " 'siybgaalcteu',\n", " '109007',\n", " 'shmagin',\n", " '147974229',\n", " 'heredero',\n", " '230937142',\n", " '827199896',\n", " 'idx208420149314',\n", " 'idx208420149315',\n", " 'idx208420149316',\n", " 'idx208420149317',\n", " 'idx208420149310',\n", " 'musiclicensingsm00unit',\n", " 'idx208420149312',\n", " 'idx208420149313',\n", " '254209',\n", " 'cueur',\n", " 'idx208420149318',\n", " 'idx208420149319',\n", " '233697515',\n", " '7312413',\n", " '7312412',\n", " '7312411',\n", " 'fficiency',\n", " '7312417',\n", " '7312416',\n", " '7312415',\n", " '7312419',\n", " 'annualreporttown1960cent',\n", " 'caaperating',\n", " '291157',\n", " '04966000',\n", " 'histoireabrg00joll',\n", " 'psyamericanjourn33ameruoft',\n", " 'präparator',\n", " '61232861',\n", " '116782',\n", " 'problemofevilinp00full',\n", " 'dnsc8703201',\n", " 'idx208420042778',\n", " 'idx208420042779',\n", " 'a946',\n", " 'a940',\n", " 'a941',\n", " 'a942',\n", " 'a943',\n", " 'idx208420042770',\n", " 'idx208420042771',\n", " 'idx208420042772',\n", " 'idx208420042773',\n", " 'idx208420042774',\n", " 'perjudicados',\n", " 'idx208420042776',\n", " 'idx208420042777',\n", " 'establishplanned00bost',\n", " '294656',\n", " '294654',\n", " '294652',\n", " 
'294653',\n", " '294650',\n", " '294651',\n", " 'ascaláfidos',\n", " '06065200',\n", " 'livernois',\n", " '294658',\n", " '294659',\n", " 'henryviiienglish01gasq',\n", " 'va2253',\n", " '0160765609',\n", " 'economy4',\n", " ...]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fd.hapaxes()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def hasNumbers(inputString):\n", " return any(char.isdigit() for char in inputString)\n" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "392117" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "texthaps = []\n", "for hap in fd.hapaxes():\n", " if not hasNumbers(hap):\n", " texthaps.append(hap)\n", "len(texthaps)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "2773182" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(fd.hapaxes())" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "['contiiinecl',\n", " 'greise',\n", " 'noveluber',\n", " 'afgestooken',\n", " 'wintkagasspan',\n", " 'choegun',\n", " 'meeler',\n", " 'underbank',\n", " 'ufers',\n", " 'riilcil',\n", " 'qurxtity',\n", " 'ficantly',\n", " 'racticul',\n", " 'wageindex',\n", " 'bankroll',\n", " 'fiy',\n", " 'peble',\n", " 'colvtalner',\n", " 'manipulaton',\n", " 'todod',\n", " 'whichemploy',\n", " 'néill',\n", " 'phalluses',\n", " 'bustum',\n", " 'embacy',\n", " 'embach',\n", " 'podali',\n", " 'footwashing',\n", " 'utarid',\n", " 'putorious',\n", " 'utarin',\n", " 'berlingebrüder',\n", " 'scrviot',\n", " 'ranchlands',\n", " 'priiliitive',\n", " 'organisada',\n", " 'chattenango',\n", " 'dayrl',\n", " 'lixuna',\n", " 'fineart',\n", " 
'enfóquese',\n", " 'nxtilnd',\n", " 'congenite',\n", " 'claymoore',\n", " 'alleviations',\n", " 'underband',\n", " 'compléte',\n", " 'ntdrmiyp',\n", " 'snrrey',\n", " 'brucharzt',\n", " 'vnrl',\n", " 'vnrk',\n", " 'gemonteerd',\n", " 'mueewscontains',\n", " 'büderverzeichnis',\n", " 'rainmaking',\n", " 'alrso',\n", " 'flnctnating',\n", " 'muskelbewegung',\n", " 'ecologicos',\n", " 'nfant',\n", " 'eann',\n", " 'eana',\n", " 'herthel',\n", " 'issliiiiiipo',\n", " 'tendenzdrama',\n", " 'swalllpy',\n", " 'tchats',\n", " 'physeodesmos',\n", " 'ailsofi',\n", " 'prsity',\n", " 'desso',\n", " 'inoiiejr',\n", " 'pedicellate',\n", " 'кампании',\n", " 'laukaan',\n", " 'totoramba',\n", " 'rightbrained',\n", " 'koekkoek',\n", " 'immmlmm',\n", " 'chonsia',\n", " 'rightlateralized',\n", " 'nerinea',\n", " 'iilacle',\n", " 'health_emphasis_program',\n", " 'newsbooks',\n", " 'lltry',\n", " 'acidlty',\n", " 'machery',\n", " 'soang',\n", " 'catanaei',\n", " 'fih',\n", " 'dentair',\n", " 'fik',\n", " 'cereti',\n", " 'kalokoe',\n", " 'cerete',\n", " 'vorcl',\n", " 'instiu',\n", " 'evaluaciónes',\n", " 'ohtaiu',\n", " 'grifflin',\n", " 'armymedicalmuseumcollectioncatalogueofpathologicaldrawingsofmedical',\n", " 'athénée',\n", " 'shadscaje',\n", " 'abrahamvs',\n", " 'marign',\n", " 'illspi',\n", " 'ncreased',\n", " 'lsouth',\n", " 'charruaud',\n", " 'moanavilla',\n", " 'suiuciuc',\n", " 'fillallcial',\n", " 'lytleton',\n", " 'preinstallation',\n", " 'pendleys',\n", " 'coustet',\n", " 'blapey',\n", " 'oubrerie',\n", " 'nazvy',\n", " 'buonconsiglio',\n", " 'laueblose',\n", " 'eiiiaient',\n", " 'paternitiy',\n", " 'lehrlings',\n", " 'roffler',\n", " 'hydrabiologie',\n", " 'yonaknoka',\n", " 'stakelin',\n", " 'isetan',\n", " 'hotisse',\n", " 'cinique',\n", " 'bogasse',\n", " 'kisima',\n", " 'hightley',\n", " 'ʹemile',\n", " 'staqe',\n", " 'wadowice',\n", " 'motilitydisturbance',\n", " 'cnnnor',\n", " 'iilpgs',\n", " 'gainestsown',\n", " 'nordhang',\n", " 'overcored',\n", " 'paperspecs',\n", 
" 'lnrvnl',\n", " 'lienor',\n", " 'eppsit',\n", " 'lnnlity',\n", " 'siybgaalcteu',\n", " 'shmagin',\n", " 'heredero',\n", " 'cueur',\n", " 'fficiency',\n", " 'caaperating',\n", " 'präparator',\n", " 'perjudicados',\n", " 'ascaláfidos',\n", " 'livernois',\n", " 'yamasukera',\n", " 'saurole',\n", " 'antsiraben',\n", " 'ahimatitsy',\n", " 'goelawa',\n", " 'lusembe',\n", " 'agrcelnent',\n", " 'maurain',\n", " 'andana',\n", " 'bucassen',\n", " 'hcving',\n", " 'invigorators',\n", " 'natalbai',\n", " 'clvc',\n", " 'caarriecl',\n", " 'efugee',\n", " 'wahasha',\n", " 'clisenscs',\n", " 'schilcrat',\n", " 'ruvin',\n", " 'calmcit',\n", " 'eazey',\n", " 'hydrophobicities',\n", " 'llcigl',\n", " 'sincan',\n", " 'cesana',\n", " 'missionassisant',\n", " 'yecorato',\n", " 'sellare',\n", " 'tibetische',\n", " 'presidentswashington',\n", " 'tehefoucte',\n", " 'ethnomedicine',\n", " 'tasmanien',\n", " 'saimdang',\n", " 'linbs',\n", " 'harlaire',\n", " 'ethiop',\n", " 'bluehill',\n", " 'kafri',\n", " 'tekau',\n", " 'hurney',\n", " 'tioton',\n", " 'tekam',\n", " 'portaient',\n", " 'ssistetla',\n", " 'ghottingen',\n", " 'circuito',\n", " 'kaiawase',\n", " 'faucettralphebiography',\n", " 'definciency',\n", " 'belolawek',\n", " 'longnor',\n", " 'pilosebaceous',\n", " 'stoillach',\n", " 'msun',\n", " 'nlcclical',\n", " 'throi',\n", " 'kaululaau',\n", " 'throa',\n", " 'throc',\n", " 'антоновские',\n", " 'illatter',\n", " 'bollwyller',\n", " 'winkelmannischen',\n", " 'drycell',\n", " 'treff',\n", " 'yull',\n", " 'cialized',\n", " 'dahcotah',\n", " 'komarovʺ',\n", " 'rossolimo',\n", " 'thousaild',\n", " 'lognette',\n", " 'galgesberg',\n", " 'umbeluzi',\n", " 'prorluct',\n", " 'stephanius',\n", " 'castellamonte',\n", " 'lewice',\n", " 'wojtosik',\n", " 'abdeldaim',\n", " 'haitt',\n", " 'haita',\n", " 'schnée',\n", " 'gathr',\n", " 'piranema',\n", " 'lucrene',\n", " 'godov',\n", " 'hyten',\n", " 'jadot',\n", " 'brsorc',\n", " 'eingaklebt',\n", " 'swotting',\n", " 'kistenmacher',\n", " 
'tichig',\n", " 'lliliversity',\n", " 'chimneystacks',\n", " 'dandolo',\n", " 'shevistskikh',\n", " 'gentisate',\n", " 'semiofficiul',\n", " 'mearsheimer',\n", " 'jounrnal',\n", " 'waliace',\n", " 'wahrhafftige',\n", " 'rechardson',\n", " 'turnikom',\n", " 'niounttlins',\n", " 'baramadagascar',\n", " 'ressainance',\n", " 'janorschke',\n", " 'аграрной',\n", " 'editnr',\n", " 'upholstory',\n", " 'rcjuis',\n", " 'mormoopidae',\n", " 'dipsogenic',\n", " 'ŭihoe',\n", " 'salomonsen',\n", " 'crames',\n", " 'apalaches',\n", " 'mcotea',\n", " 'sagacitate',\n", " 'fringy',\n", " 'seminaristinnenkurs',\n", " 'encouragers',\n", " 'alî',\n", " 'agmiller',\n", " 'morzon',\n", " 'insitituto',\n", " 'regentibus',\n", " 'strandmark',\n", " 'ladhoff',\n", " 'иностранным',\n", " 'créance',\n", " 'branty',\n", " 'nncienl',\n", " 'pietruszo',\n", " 'reous',\n", " 'pietrusza',\n", " 'requring',\n", " 'eigl',\n", " 'eigo',\n", " 'sinamary',\n", " 'dashino',\n", " 'dorpsoverste',\n", " 'eevvaannggeelliizziinngg',\n", " 'lloyte',\n", " 'wwod',\n", " 'ontiberos',\n", " 'cqulvnlent',\n", " 'espero',\n", " 'photogrammety',\n", " 'ulst',\n", " 'multifire',\n", " 'cuicado',\n", " 'adamsone',\n", " 'dcvclqpwnt',\n", " 'aspian',\n", " 'adamsons',\n", " 'converty',\n", " 'travleing',\n", " 'dendale',\n", " 'cveli',\n", " 'ashurnasirpal',\n", " 'lsbilitgo',\n", " 'abgerungen',\n", " 'tacting',\n", " 'reeneng',\n", " 'nucleotidases',\n", " 'alz',\n", " 'diaconia',\n", " 'langnaw',\n", " 'phorbia',\n", " 'schwst',\n", " 'mediatii',\n", " 'einschlieslich',\n", " 'losleben',\n", " 'withsubsequent',\n", " 'boxman',\n", " 'gewonnener',\n", " 'hensry',\n", " 'bruti',\n", " 'gefährligkeiten',\n", " 'rhayader',\n", " 'albenseiten',\n", " 'reloplnent',\n", " 'approsimately',\n", " 'vlttu',\n", " 'louzon',\n", " 'gloghini',\n", " 'reasest',\n", " 'diigital',\n", " 'chúng',\n", " 'junkets',\n", " 'glenrosa',\n", " 'harmount',\n", " 'abarms',\n", " 'wawuna',\n", " 'hogards',\n", " 'delinavit',\n", " 'ē',\n", " 
'solvccl',\n", " 'lumutbalai',\n", " 'sigananda',\n", " 'martargis',\n", " 'iaensc',\n", " 'friedrichstafen',\n", " 'elyah',\n", " 'lililyo',\n", " 'hungerpest',\n", " 'shaijples',\n", " 'estioin',\n", " 'akomatsu',\n", " 'latil',\n", " 'unfought',\n", " 'kasavubu',\n", " 'rited',\n", " 'disfrutara',\n", " 'piedmonth',\n", " 'insepector',\n", " 'latiu',\n", " 'wanzhou',\n", " 'вапко',\n", " 'claii',\n", " 'tomologyw',\n", " 'claie',\n", " 'thatsächliche',\n", " 'braunschw',\n", " 'tlcleterious',\n", " 'comwesseafron',\n", " 'cliscoveri',\n", " 'cliscoverg',\n", " 'cliscoverd',\n", " 'onyefulu',\n", " 'eggsquisite',\n", " 'binjouin',\n", " 'vatundamu',\n", " 'lugee',\n", " 'merpeople',\n", " 'fcedings',\n", " 'jaywalkers',\n", " 'dllrille',\n", " 'durres',\n", " 'durret',\n", " 'ungen',\n", " 'ungeh',\n", " 'suerfu',\n", " 'ungef',\n", " 'fischtrockenplatz',\n", " 'amaudruz',\n", " 'atact',\n", " 'podepsa',\n", " 'citlzens',\n", " 'hoëvell',\n", " 'feteioa',\n", " 'jsteiv',\n", " 'pánico',\n", " 'sosikrates',\n", " 'collectivized',\n", " 'gülden',\n", " 'ordinavit',\n", " 'asxi',\n", " 'intimiano',\n", " 'ctzel',\n", " 'boutot',\n", " 'murza',\n", " 'celebrees',\n", " 'halacsy',\n", " 'obsèques',\n", " 'availnl',\n", " 'arcandi',\n", " 'cidio',\n", " 'consecta',\n", " 'soopahya',\n", " 'houtton',\n", " 'yungon',\n", " 'breissgauischen',\n", " 'suhe',\n", " 'suho',\n", " 'suhs',\n", " 'sanitio',\n", " 'infrmatin',\n", " 'romatipografia',\n", " 'xianwu孔宪武',\n", " 'gijsbertus',\n", " 'propolae',\n", " 'gastrophod',\n", " 'medisons',\n", " 'accão',\n", " 'probates',\n", " 'vergier',\n", " 'головину',\n", " 'tjlu',\n", " 'syncopalis',\n", " 'tjll',\n", " 'tjli',\n", " 'tenthouses',\n", " 'heldenbuch',\n", " 'nonunions',\n", " 'beets_______________________________________________',\n", " 'sitllation',\n", " 'muckerman',\n", " 'illvasioli',\n", " 'spaski',\n", " 'dascalescu',\n", " 'belgariad',\n", " 'pothicary',\n", " 'oouurrccuullttuurree',\n", " 'плеть',\n", " 
'fogva',\n", " 'noening',\n", " 'uncentralized',\n", " 'zuydt',\n", " 'ambitiously',\n", " 'cllallges',\n", " 'ivifbboobrrrniiiggiinngg',\n", " 'mgney',\n", " 'babungokind',\n", " 'iinetl',\n", " 'bioisosteres',\n", " 'pressurevolume',\n", " 'isurumunija',\n", " 'solfeggio',\n", " 'illformed',\n", " 'merdin',\n", " 'dannal',\n", " 'dannam',\n", " 'cakenge',\n", " 'merhandise',\n", " 'dannat',\n", " 'vniuersis',\n", " 'magnética',\n", " 'caxcan',\n", " 'cjrried',\n", " 'cessing',\n", " 'oeurs',\n", " 'investmerrts',\n", " 'götzenfesten',\n", " 'lnental',\n", " 'mdclxxxxix',\n", " 'essercitato',\n", " 'munduruku',\n", " 'atomgruppen',\n", " 'женской',\n", " 'copiè',\n", " 'benicht',\n", " 'sairsc',\n", " 'genvinam',\n", " 'thehvside',\n", " 'rachitogenesis',\n", " 'aclminislration',\n", " 'koepf',\n", " 'kift',\n", " 'detenninecl',\n", " 'horloger',\n", " 'naudaei',\n", " 'magaro',\n", " 'limbowe',\n", " 'abegawa',\n", " 'mahemiah',\n", " 'bakevellia',\n", " '牧草大田轮作制的理论与技术',\n", " 'canaletti',\n", " 'conrltr',\n", " 'failittg',\n", " 'patert',\n", " 'ganisnl',\n", " 'optitrack',\n", " 'stationsgeschwister',\n", " 'sonderbahre',\n", " 'reestab',\n", " 'tranvik',\n", " 'ericksonhurt',\n", " 'ayudarán',\n", " 'superintendint',\n", " 'cfveral',\n", " 'bisignano',\n", " 'lagrassa',\n", " 'hwadlefy',\n", " 'prrat',\n", " 'dissapproval',\n", " 'platenses',\n", " 'gorean',\n", " 'oolanahhee',\n", " 'snni',\n", " 'yuj',\n", " 'riječi',\n", " 'friedensengel',\n", " 'shoort',\n", " 'pyonchan',\n", " 'oncepts',\n", " 'flatcl',\n", " 'typper',\n", " 'sarcamento',\n", " 'scecam',\n", " 'elllenton',\n", " 'grufferman',\n", " 'nerby',\n", " 'sabeundi',\n", " 'hulsobus',\n", " 'nicklos',\n", " 'hallengeso',\n", " 'thâeophile',\n", " 'hypother',\n", " 'angiolini',\n", " 'cuthrie',\n", " 'sorgfältigste',\n", " 'gretehelskov',\n", " 'latabll',\n", " 'petroiacomo',\n", " 'cltlily',\n", " 'prnited',\n", " 'bejewelled',\n", " 'charmber',\n", " 'streynsham',\n", " 'mangaoang',\n", " 
'marantearum',\n", " 'inocultztio',\n", " 'bescherme',\n", " 'surrouncled',\n", " 'ohrloff',\n", " 'luggs',\n", " 'kršćanskom',\n", " 'lugga',\n", " 'irlcilils',\n", " 'heavenbeijingming',\n", " 'frommarch',\n", " 'wasenberg',\n", " 'augustln',\n", " 'shodows',\n", " 'venkatanarasimha',\n", " 'mavjen',\n", " 'tenox',\n", " 'eiran',\n", " 'vnccint',\n", " 'tabyshalieva',\n", " 'millwr',\n", " 'prevendado',\n", " 'millwn',\n", " 'wärmflaschen',\n", " 'lillypad',\n", " 'pharisiens',\n", " 'trsiiive',\n", " 'accwnul',\n", " 'belcano',\n", " 'nitwi',\n", " 'menaham',\n", " 'meditsinskii',\n", " 'vigileo',\n", " 'kressner',\n", " 'efforcaient',\n", " 'fulanis',\n", " 'sorgotten',\n", " 'faciendo',\n", " 'sanshui',\n", " 'facienda',\n", " 'zeën',\n", " 'shacleford',\n", " 'parguasa',\n", " 'jtiaidia',\n", " 'videofile',\n", " 'kuchni',\n", " 'tanceac',\n", " 'bonifont',\n", " 'unatineg',\n", " 'consvls',\n", " 'sutek',\n", " 'norresundby',\n", " 'originalgrösse',\n", " 'specialistsâ',\n", " 'pflycbolojfy',\n", " 'lopardi',\n", " 'mjesta',\n", " 'yokevich',\n", " 'strafeships',\n", " 'mahachan',\n", " 'plistodon',\n", " 'ficticsoosi',\n", " 'psnlts',\n", " 'weegar',\n", " 'dobrego',\n", " 'iderable',\n", " 'cotapino',\n", " 'oversllpply',\n", " 'octaethylporphyrinatomanganese',\n", " 'iderably',\n", " 'signiiicnnt',\n", " 'robbards',\n", " 'leimentoll',\n", " 'lamroena',\n", " 'noncompact',\n", " 'ryckère',\n", " 'primaveral',\n", " 'gnieznienskiego',\n", " 'tamikas',\n", " 'grenony',\n", " 'bactcria',\n", " 'gastroli',\n", " 'ocotal',\n", " 'riconoscenza',\n", " 'monoarticular',\n", " 'bulletinsummerquappa',\n", " 'crotzer',\n", " 'utilus',\n", " 'talija',\n", " 'fereeda',\n", " 'kleinster',\n", " 'kleinstes',\n", " 'promnix',\n", " 'rusciano',\n", " 'chrisant',\n", " 'misshappen',\n", " 'oinicn',\n", " 'связанных',\n", " 'harrewyn',\n", " 'phlegm',\n", " 'skupljena',\n", " 'epidemiologi',\n", " 'imags',\n", " 'berliz',\n", " 'sheelat',\n", " 'descobertos',\n", " 
'unreviewed',\n", " 'caesareansectionatfullterm',\n", " 'justitiam',\n", " 'rairden',\n", " 'strakville',\n", " 'vavenby',\n", " 'unmt',\n", " 'tuscapampa',\n", " 'morlund',\n", " 'hrougll',\n", " 'cluestionable',\n", " 'kupecký',\n", " 'vebred',\n", " 'guican',\n", " 'lastboote',\n", " 'nitrogenn',\n", " 'mewaygo',\n", " 'nitrogenj',\n", " 'womenßs',\n", " 'turistjcgob',\n", " 'explicatifs',\n", " 'cuyubini',\n", " 'carwrecks',\n", " 'korlet',\n", " 'nalyo',\n", " 'parasitarias',\n", " 'auristes',\n", " 'cawte',\n", " 'mitlli',\n", " 'clairinghnigh',\n", " 'gierish',\n", " 'estanco',\n", " 'ugel',\n", " 'missionsalmosen',\n", " 'uger',\n", " 'ussie',\n", " 'choquita',\n", " 'factaor',\n", " 'intermecliate',\n", " 'văn',\n", " 'tempelterrasse',\n", " 'multisynaptic',\n", " 'palmerly',\n", " 'malakhovskago',\n", " 'cheother',\n", " 'tremellen',\n", " 'diputndo',\n", " 'uvum',\n", " 'garwicz',\n", " 'tsournos',\n", " 'licntion',\n", " 'bauant',\n", " 'виссарионович',\n", " 'powdilfhorn',\n", " 'murphay',\n", " 'geroll',\n", " 'feyken',\n", " 'fromrichard',\n", " 'eastabrooks',\n", " 'jugali',\n", " 'mégaptère',\n", " 'utlis',\n", " 'xsoiirces',\n", " 'mgct',\n", " 'abulencia',\n", " 'marindin',\n", " 'blumenga',\n", " 'brethaur',\n", " 'guitele',\n", " 'dégré',\n", " 'potjokonkong',\n", " 'erysipela',\n", " 'variodermite',\n", " 'depiciences',\n", " 'shadkhen',\n", " 'catechetische',\n", " 'owyhigh',\n", " 'moutliecl',\n", " 'scherschligt',\n", " 'soudé',\n", " 'valbracht',\n", " 'historienschreiber',\n", " 'zamparelli',\n", " 'foulc',\n", " 'lecram',\n", " 'poespa',\n", " 'hospsital',\n", " 'veteriizary',\n", " 'ezekeli',\n", " 'unqualifiecl',\n", " 'eloya',\n", " 'succer',\n", " 'reginarum',\n", " 'succee',\n", " 'bildenback',\n", " 'chahogum',\n", " 'generalate',\n", " 'ordensmönchen',\n", " 'karioglu',\n", " 'proverbia',\n", " 'élisabeth',\n", " 'doncqu',\n", " 'publisherl',\n", " 'bameileke',\n", " 'fannel',\n", " 'alligence',\n", " 'prevellts',\n", " 
'aassembly',\n", " 'enroller',\n", " 'metiokochda',\n", " 'trumall',\n", " 'killbourn',\n", " 'tailaferro',\n", " 'enforcerneiit',\n", " 'linggau',\n", " 'baranovski',\n", " 'xenpang',\n", " 'etymologicas',\n", " 'ahoo',\n", " 'officemax',\n", " 'thrach',\n", " 'fiskerjenten',\n", " 'düsseldorff',\n", " 'fosfatado',\n", " 'nechl',\n", " 'necho',\n", " 'ne_bras_ka',\n", " 'cosp',\n", " 'steaminj',\n", " 'mlekopitaiushchi',\n", " 'kodachromes',\n", " 'tyrolis',\n", " 'laminoplasty',\n", " 'mcgegor',\n", " 'lakeoff',\n", " 'asthenospheric',\n", " 'robatzek',\n", " 'privileres',\n", " 'yurugi',\n", " 'gmj',\n", " 'gmn',\n", " 'gmb',\n", " 'uppon',\n", " 'cuddapan',\n", " 'terrerium',\n", " 'ninnigret',\n", " 'mutilados',\n", " 'strandtmann',\n", " 'wäscherei',\n", " 'hilalcement',\n", " 'собраны',\n", " 'graspers',\n", " 'gesetzgebungen',\n", " 'benandri',\n", " 'masonthe',\n", " 'rusticity',\n", " 'sencoes',\n", " 'maineman',\n", " 'enemecio',\n", " 'foldwer',\n", " 'brimsby',\n", " 'convience',\n", " 'triunfalel',\n", " 'marshmellow',\n", " 'mtji',\n", " 'ebraeorvm',\n", " 'stwrt',\n", " 'leptandrin',\n", " 'samilkameenensis',\n", " 'zaander',\n", " 'unple',\n", " 'rrelelv',\n", " 'melothian',\n", " 'rorich',\n", " 'deprllcls',\n", " 'cwky',\n", " 'awesomely',\n", " 'accommodationist',\n", " 'dilena',\n", " 'silkrree',\n", " 'intrabead',\n", " 'armymedicalmuseumcollectionlogbooksprovisionalpathologicalseries',\n", " 'padoukholz',\n", " 'lienholders',\n", " 'beethom',\n", " 'organian',\n", " 'legary',\n", " 'rofewors',\n", " 'roseneid',\n", " 'subpena',\n", " 'nwhieh',\n", " 'mcmadaw',\n", " 'iyula',\n", " 'chengwen许成文',\n", " 'dhamala',\n", " 'нагорный',\n", " 'fulvigula',\n", " 'erntetanz',\n", " 'passmores',\n", " 'gallicarum',\n", " 'yfls',\n", " 'coalcompany',\n", " 'romatic',\n", " 'zobo',\n", " 'yfli',\n", " 'dadle',\n", " 'bsatroop',\n", " 'rcalixar',\n", " 'purumi',\n", " 'aescorting',\n", " 'audiovisuelles',\n", " 'brinkac',\n", " 'teritoriul',\n", " 
'illdefinite',\n", " 'finanze',\n", " 'teasppon',\n", " 'novopen',\n", " 'ramasseurs',\n", " 'amirdara',\n", " 'mulanax',\n", " 'shulzes',\n", " 'sleying',\n", " 'mclliocls',\n", " 'qugmentation',\n", " 'berggeschichten',\n", " 'imint',\n", " 'stlein',\n", " 'distiehlis',\n", " 'resoarch',\n", " 'albigenses',\n", " 'rajasingha',\n", " 'ungoya',\n", " 'inkunabel',\n", " 'stactites',\n", " 'contillually',\n", " 'evenutally',\n", " 'deformidad',\n", " 'le__',\n", " 'fullchearings',\n", " 'gardnevi',\n", " 'obliquinity',\n", " 'charduar',\n", " 'stiiclic',\n", " 'engelska',\n", " 'jakins',\n", " 'huach',\n", " 'engelskt',\n", " 'descoeudres',\n", " 'barcal',\n", " 'seltlo',\n", " 'generalhead',\n", " 'wasdetermined',\n", " 'stratoii',\n", " 'asentaran',\n", " 'nationagl',\n", " 'raypublican',\n", " 'bindernagel',\n", " 'rudal',\n", " 'venkatachalapathy',\n", " 'angioletti',\n", " 'tamtamspeler',\n", " 'glstn',\n", " 'gansarski',\n", " 'remijius',\n", " 'ilune',\n", " 'conontoxin',\n", " 'lreling',\n", " 'alfrlfr',\n", " 'morrili',\n", " 'snouted',\n", " 'incurabli',\n", " 'deparptment',\n", " 'deviacion',\n", " 'glockebaum',\n", " 'connitlons',\n", " 'potö',\n", " 'forestell',\n", " 'glayd',\n", " 'gevaerlijcke',\n", " 'glays',\n", " 'meresman',\n", " 'januam',\n", " 'other_so_',\n", " 'ongeo',\n", " 'irmscher',\n", " 'clitioiis',\n", " 'rinserrano',\n", " 'hylenaea',\n", " 'yonathan',\n", " 'pinlc',\n", " 'outwith',\n", " 'iinnssppiriartiaotniiostns',\n", " 'arisoptera',\n", " 'ensnare',\n", " 'schallbildung',\n", " 'ranexa',\n", " 'olsenm',\n", " 'duchesnoy',\n", " 'bowlingtown',\n", " 'laumsa',\n", " 'remport',\n", " 'serting',\n", " 'estandarizadas',\n", " 'ivac',\n", " 'depraedation',\n", " 'tiongco',\n", " 'shahnovich',\n", " 'chausses',\n", " 'emisije',\n", " 'peyritsch',\n", " 'kaake',\n", " 'affligeante',\n", " 'brukes',\n", " 'bruker',\n", " 'ontstanding',\n", " 'mpraéso',\n", " ...]" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" 
} ], "source": [ "texthaps" ] }, { "cell_type": "code", "execution_count": 59, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "biodiv | 36351\n", "rumsey | 11564\n", "commonwealth | 17291\n", "georgia | 19225\n", "harvard | 10822\n", "ia | 82685\n", "getty | 8896\n", "kentucky | 10656\n", "minnesota | 14554\n", "missouri | 17175\n", "mwdl | 161243\n", "nara | 42642\n", "nocar | 30677\n", "smiths | 96110\n", "socar | 18369\n", "texas | 23045\n", "gpo | 27582\n", "illinois | 19754\n", "usc | 70684\n", "virginia | 6878\n", "nocoll | 995\n", "artstor | 392117\n" ] } ], "source": [ "# For every collection, collect the hapaxes that contain no digit characters\n", "# and print one 'name | count' line per collection.\n", "#\n", "# Copy `colls` rather than aliasing it: `colls2 = colls` followed by append()\n", "# mutated `colls` itself, so every re-run of this cell added another 'artstor'.\n", "colls2 = list(colls)\n", "if 'artstor' not in colls2:\n", "    colls2.append('artstor')\n", "\n", "# NOTE(review): the recorded output shows 'artstor | 392117', the same number as\n", "# len(texthaps) computed from `fd` in an earlier cell -- confirm that\n", "# fds['artstor'] really is the per-collection distribution.\n", "colltexthaps = {}\n", "for coll in colls2:\n", "    colltexthaps[coll] = [hap for hap in fds[coll].hapaxes() if not hasNumbers(hap)]\n", "    print(coll, \"|\", len(colltexthaps[coll]))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.3.5" } }, "nbformat": 4, "nbformat_minor": 0 }