{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "docword_file = '/home/romovpa/notebooks/bigartm-book/applications/uci_bow/data/docword.nips.txt'\n",
    "dict_file = '/home/romovpa/notebooks/bigartm-book/applications/uci_bow/data/vocab.nips.txt'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "./bigartm --corpus-format bow --read-corpus /home/romovpa/notebooks/bigartm-book/applications/uci_bow/data/docword.nips.txt --use-dictionary-bow /home/romovpa/notebooks/bigartm-book/applications/uci_bow/data/vocab.nips.txt --topics 10 --update-every 1 --passes 10 --kappa 0.5 --tau0 64\n",
      "libartm.so: cannot open shared object file: No such file or directory, fall back to ARTM_SHARED_LIBRARY environment variable\n",
      "Create temporary batch folder: /tmp/tmpMuNvzM\n",
      "Parse collection: /home/romovpa/notebooks/bigartm-book/applications/uci_bow/data/docword.nips.txt -> /tmp/tmpMuNvzM, batch_size=1000\n",
      "Initialize model\n",
      "W0612 18:59:26.353670 12134 merger.cc:300] SynchronizeModel() did not found any increments to topic model urn:uuid:ffb0c1f2-111b-11e5-96b3-fa163e8d9532\n",
      "processed 1000 items, perplexity = 12185.621588\n",
      "processed 2500 items, perplexity = 1248.822991\n",
      "processed 4000 items, perplexity = 1617.799503\n",
      "processed 5500 items, perplexity = 1802.688212\n",
      "processed 7000 items, perplexity = 1907.541042\n",
      "processed 8500 items, perplexity = 1971.545763\n",
      "processed 10000 items, perplexity = 2010.787087\n",
      "processed 11500 items, perplexity = 2034.872774\n",
      "processed 13000 items, perplexity = 2049.129378\n",
      "processed 14500 items, perplexity = 2056.932210\n",
      "processed 15000 items, perplexity = 2057.239507\n",
      "Top tokens per topic:\n",
      "Topic#1: learning (0.022) network (0.014) input (0.010) set (0.010) neural (0.009) weight (0.008) algorithm (0.008) model (0.007) task (0.007) function (0.006) \n",
      "Topic#2: network (0.015) weight (0.012) system (0.011) neural (0.009) neuron (0.009) output (0.009) learning (0.008) current (0.007) circuit (0.006) input (0.006) \n",
      "Topic#3: network (0.015) unit (0.010) output (0.008) point (0.008) problem (0.008) function (0.007) images (0.007) image (0.006) learning (0.006) layer (0.006) \n",
      "Topic#4: model (0.018) neuron (0.015) network (0.013) input (0.010) cell (0.010) system (0.008) unit (0.006) neural (0.006) pattern (0.006) synaptic (0.006) \n",
      "Topic#5: network (0.018) data (0.011) system (0.009) model (0.008) algorithm (0.007) word (0.006) parameter (0.006) component (0.005) linear (0.005) term (0.005) \n",
      "Topic#6: data (0.009) neural (0.009) number (0.006) set (0.006) model (0.006) motion (0.006) result (0.005) pattern (0.005) system (0.005) rate (0.005) \n",
      "Topic#7: function (0.021) algorithm (0.016) network (0.014) training (0.010) error (0.010) learning (0.009) neural (0.009) weight (0.008) number (0.008) result (0.007) \n",
      "Topic#8: network (0.033) input (0.026) set (0.013) unit (0.012) function (0.012) output (0.010) learning (0.009) system (0.009) training (0.008) algorithm (0.007) \n",
      "Topic#9: model (0.022) learning (0.016) set (0.011) data (0.010) problem (0.009) method (0.008) function (0.007) error (0.006) distribution (0.006) step (0.006) \n",
      "Topic#10: model (0.011) training (0.010) neural (0.010) network (0.010) input (0.009) set (0.009) error (0.008) output (0.008) function (0.007) learning (0.007) \n"
     ]
    }
   ],
   "source": [
    "!./bigartm --corpus-format bow --read-corpus {docword_file} --use-dictionary-bow {dict_file} --topics 10 --update-every 1 --passes 10 --kappa 0.5 --tau0 64"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "docword_file = '/home/romovpa/notebooks/bigartm-book/applications/uci_bow/data/docword.enron.txt'\n",
    "dict_file = '/home/romovpa/notebooks/bigartm-book/applications/uci_bow/data/vocab.enron.txt'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {
    "collapsed": false,
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "./bigartm --corpus-format bow --read-corpus /home/romovpa/notebooks/bigartm-book/applications/uci_bow/data/docword.enron.txt --use-dictionary-bow /home/romovpa/notebooks/bigartm-book/applications/uci_bow/data/vocab.enron.txt --batch-size 30000 --topics 10 --update-every 1 --passes 10 --kappa 0.5 --tau0 64\n",
      "libartm.so: cannot open shared object file: No such file or directory, fall back to ARTM_SHARED_LIBRARY environment variable\n",
      "Create temporary batch folder: /tmp/tmpAFXQJ6\n",
      "Parse collection: /home/romovpa/notebooks/bigartm-book/applications/uci_bow/data/docword.enron.txt -> /tmp/tmpAFXQJ6, batch_size=30000\n",
      "Initialize model\n",
      "processed 39861 items, perplexity = 27412.710768\n",
      "processed 79722 items, perplexity = 11517.766680\n",
      "processed 119583 items, perplexity = 8602.245303\n",
      "processed 159444 items, perplexity = 7386.092703\n",
      "processed 199305 items, perplexity = 6694.426605\n",
      "processed 239166 items, perplexity = 6237.597039\n",
      "processed 279027 items, perplexity = 5904.315310\n",
      "processed 318888 items, perplexity = 5645.505335\n",
      "processed 358749 items, perplexity = 5435.339423\n",
      "processed 398610 items, perplexity = 5259.309476\n",
      "processed 398610 items, perplexity = 5259.309476\n",
      "Top tokens per topic:\n",
      "Topic#1: meeting (0.006) california (0.006) office (0.006) group (0.004) team (0.004) energy (0.004) deal (0.004) issues (0.004) point (0.003) market (0.003) \n",
      "Topic#2: power (0.013) california (0.011) energy (0.009) electricity (0.006) utility (0.006) contract (0.005) prices (0.005) corp (0.005) states (0.005) plan (0.005) \n",
      "Topic#3: energy (0.010) customer (0.009) market (0.008) order (0.006) access (0.005) ferc (0.004) message (0.004) page (0.004) program (0.004) service (0.004) \n",
      "Topic#4: meeting (0.006) free (0.004) energy (0.004) help (0.004) game (0.004) number (0.004) against (0.004) updated (0.003) going (0.003) look (0.003) \n",
      "Topic#5: power (0.008) california (0.007) cost (0.007) attached (0.006) meeting (0.005) comment (0.005) davis (0.005) bill (0.005) rates (0.004) electricity (0.004) \n",
      "Topic#6: company (0.010) energy (0.007) business (0.007) gas (0.006) market (0.006) stock (0.005) companies (0.005) houston (0.004) power (0.004) investment (0.004) \n",
      "Topic#7: company (0.010) market (0.007) price (0.006) power (0.005) california (0.005) database (0.005) gas (0.004) customer (0.004) contract (0.004) operation (0.004) \n",
      "Topic#8: company (0.016) firm (0.007) services (0.005) power (0.005) business (0.005) fund (0.005) financial (0.005) technology (0.004) investor (0.004) agreement (0.004) \n",
      "Topic#9: texas (0.007) team (0.007) top (0.004) plan (0.004) list (0.004) play (0.003) longhorn (0.003) energy (0.003) power (0.003) cost (0.003) \n",
      "Topic#10: final (0.007) company (0.007) market (0.005) report (0.005) hour (0.005) offer (0.005) price (0.005) investment (0.005) deal (0.005) companies (0.005) \n"
     ]
    }
   ],
   "source": [
    "!./bigartm --corpus-format bow --read-corpus {docword_file} --use-dictionary-bow {dict_file} --batch-size 30000 --topics 10 --update-every 1 --passes 10 --kappa 0.5 --tau0 64"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}