{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "Research2VecTraining2.ipynb", "version": "0.3.2", "provenance": [], "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "accelerator": "GPU" }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "[View in Colaboratory](https://colab.research.google.com/github/Santosh-Gupta/Research2Vec/blob/master/Research2VecTraining2.ipynb)" ] }, { "metadata": { "id": "cz8FsZRH99zd", "colab_type": "code", "colab": {} }, "cell_type": "code", "source": [ "import math\n", "import numpy as np\n", "import random\n", "import zipfile\n", "import os\n", "import tensorflow as tf\n", "import pandas as pd\n", "import pickle\n", "\n", "!pip install -U -q PyDrive\n", "\n", "from google.colab import files\n", "from pydrive.auth import GoogleAuth\n", "from pydrive.drive import GoogleDrive\n", "from google.colab import auth\n", "from oauth2client.client import GoogleCredentials\n", "\n", "from numpy import genfromtxt\n", "\n", "auth.authenticate_user()\n", "gauth = GoogleAuth()\n", "gauth.credentials = GoogleCredentials.get_application_default()\n", "drive = GoogleDrive(gauth)\n", "\n", "vocabulary_size = 1666577 \n", "\n", "# tf.logging.set_verbosity(tf.logging.ERROR)" ], "execution_count": 0, "outputs": [] }, { "metadata": { "id": "61iA1a4z_-nb", "colab_type": "code", "colab": {} }, "cell_type": "code", "source": [ "os.remove('adc.json')" ], "execution_count": 0, "outputs": [] }, { "metadata": { "id": "nW9LJi9FshOE", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 35 }, "outputId": "e18b1cf7-11eb-47e6-e709-cc379a2e6a19" }, "cell_type": "code", "source": [ "dl_id = input(\"Enter Gdrive file ID for Title dictionary: \") # 9-10-18 1mdF0JfrzbOxeHD26JaUW8KEfIdMEHzQG\n", "\n", "thefile = drive.CreateFile({'id': dl_id})\n", "thefile.GetContentFile('titleDict.pickle')\n", "\n", "with open('titleDict.pickle', 'rb') as handle:\n", " bookDictionary = pickle.load(handle)" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "Enter Gdrive file ID for Title dictionary: 1mdF0JfrzbOxeHD26JaUW8KEfIdMEHzQG\n" ], "name": "stdout" } ] }, { "metadata": { "id": "BBc9exLkwBdm", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 486 }, "outputId": "103e7d6a-526d-44d6-b0e9-91faf1bf04f4" }, "cell_type": "code", "source": [ "dl_id = input(\"Enter Gdrive file ID for Data \") # 9-10-18 1RHVwT1slwbhPlNTTF1JKS7agc4hQSm5c\n", "\n", "myDownload = drive.CreateFile({'id': dl_id})\n", "myDownload.GetContentFile('Data.npy')\n", "my_data = np.load('Data.npy')\n", "print(my_data[0:15])" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "Enter Gdrive file ID for Data 1RHVwT1slwbhPlNTTF1JKS7agc4hQSm5c\n", "[[0 list([])]\n", " [1\n", " list([421089, 510776, 6403, 1554618, 1451018, 1448326, 1411539, 734702, 263668, 276186, 374145, 712335, 1540518, 732154, 1256014, 370711])]\n", " [2 list([])]\n", " [3\n", " list([896236, 552833, 290985, 744122, 660888, 1492583, 324439, 1497464, 906952, 890270, 800459, 656974, 464637, 432398, 672494, 1501784, 1551199, 169770, 880915, 1257202, 1647789, 431318, 167368, 1309706, 645636, 1589247, 952101, 1594224, 566783, 1020670, 1530466, 572983, 393055, 923629, 1349376, 455838, 168364, 1419708, 670762, 64953])]\n", " [4\n", " list([377701, 646875, 1527223, 458740, 1022675, 668690, 910689, 951671, 717587, 1655779, 670477, 66465, 374116, 450320, 
83567, 863721, 1328431, 1585189, 1439964])]\n", " [5\n", " list([1394328, 658435, 1338541, 1024419, 1193128, 1416126, 600891, 1133836, 1502110, 38954, 200361, 1271103, 914246, 580300, 337729, 316423, 1631441, 75283, 153695, 294419, 904711, 234803, 341096, 350848, 344889, 146171, 610828, 475984, 462863, 768574, 1060750, 753854, 355396, 457861, 1159063, 1074007, 919943, 1045192, 550452])]\n", " [6 list([1590967])]\n", " [7\n", " list([731413, 371576, 101514, 291861, 668641, 812990, 457315, 1428604, 216222, 313539, 475783, 1384755, 1426847, 1612089, 124271, 1259377, 1209643, 994466, 1437081, 300318, 1432000])]\n", " [8\n", " list([537201, 743717, 194785, 886957, 877387, 405472, 145841, 662184])]\n", " [9\n", " list([727615, 1033127, 1488761, 205826, 1278175, 1406008, 546451, 739509, 1412014, 1628720, 797494, 381440, 738525, 103954, 1293419, 778810, 292339, 906068])]\n", " [10\n", " list([767034, 632192, 943392, 1444320, 136613, 891973, 1497365, 1580850, 305850, 1000807, 1449216, 1476570, 301317, 1500249, 1262399, 501012, 1115942, 1058776, 1447436, 1357729, 1592057, 1498628, 1618410, 987861, 1504522])]\n", " [11 list([260217, 735748, 576441, 596114])]\n", " [12\n", " list([1139960, 983105, 295417, 557677, 1252174, 697297, 881771, 1211343, 863739, 714478, 1219226, 935298, 1043281, 1229931, 839873, 1153430, 1080857, 1654324, 137984, 1025220, 696853, 570840, 1590806, 1351588, 595129, 964004, 1472538, 239877])]\n", " [13 list([])]\n", " [14\n", " list([1255078, 525049, 1481675, 1231620, 894550, 127476, 384389, 737607, 1651253, 771448, 284807, 1127559, 213372, 687169, 1480889, 1395063, 369818, 1454291])]]\n" ], "name": "stdout" } ] }, { "metadata": { "id": "NE2HW6llJZtC", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 708 }, "outputId": "633fa965-ad55-4481-854f-045c5213fe61" }, "cell_type": "code", "source": [ "data_index = 0\n", "epoch_index = 0\n", "recEpoch_indexA = 0 #Used to help keep store of the total number of epoches with the models\n", "\n", "def generate_batch(batch_size, inputCount): #batch size = number of labels\n", " #inputCount = number of inputs per label\n", " global data_index, epoch_index\n", " \n", " batch = np.ndarray(shape=(batch_size, inputCount), dtype=np.int32) \n", " labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)\n", " \n", " n=0\n", " while n < batch_size:\n", " if len( set(my_data[data_index, 1]) ) >= inputCount:\n", " labels[n,0] = my_data[data_index, 0]\n", " batch[n] = random.sample( set(my_data[data_index, 1]), inputCount)\n", " n = n+1\n", " data_index = (data_index + 1) % len(my_data) #may have to do something like len my_data[:]\n", " if data_index == 0:\n", " epoch_index = epoch_index + 1\n", " print('Completed %d Epochs' % epoch_index)\n", " else:\n", " data_index = (data_index + 1) % len(my_data)\n", " if data_index == 0:\n", " epoch_index = epoch_index + 1\n", " print('Completed %d Epochs' % epoch_index)\n", " \n", " return batch, labels \n", " \n", "here, goes = generate_batch(20, 4) # to do next, insert %len(headernumber)\n", "print('batch', here)\n", "print('labels', goes)" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "batch [[1540518 712335 276186 510776]\n", " [1020670 890270 656974 167368]\n", " [ 83567 377701 910689 646875]\n", " [1271103 75283 344889 475984]\n", " [1384755 313539 994466 457315]\n", " [ 145841 877387 194785 886957]\n", " [1412014 1628720 797494 1278175]\n", " [1476570 1449216 1357729 1592057]\n", " [ 260217 735748 596114 576441]\n", " [1252174 1043281 935298 
570840]\n", " [ 284807 1127559 1231620 1395063]\n", " [ 425605 1199985 503766 1177226]\n", " [ 883387 1249697 1369264 1606440]\n", " [ 143641 823302 1020170 1480253]\n", " [ 633950 69171 905572 319694]\n", " [ 145147 1299803 1441307 381248]\n", " [ 391622 1203282 1594386 1482127]\n", " [ 205375 1665861 400547 692811]\n", " [ 531508 22134 760494 1454629]\n", " [1628449 474993 1129303 875062]]\n", "labels [[ 1]\n", " [ 3]\n", " [ 4]\n", " [ 5]\n", " [ 7]\n", " [ 8]\n", " [ 9]\n", " [10]\n", " [11]\n", " [12]\n", " [14]\n", " [15]\n", " [17]\n", " [18]\n", " [19]\n", " [20]\n", " [21]\n", " [22]\n", " [23]\n", " [24]]\n" ], "name": "stdout" } ] },
{ "metadata": { "id": "EN8lEIKJxmRF", "colab_type": "code", "colab": {} }, "cell_type": "code", "source": [ "batch_size = 2048 # 2^11 labels per batch\n", "\n", "embedding_size = 80 # Dimension of the embedding vector.\n", "num_inputs = 4 # Number of input papers averaged per label\n", "\n", "num_sampled = 128 # Number of negative examples to sample.\n", "\n", "graph = tf.Graph()\n", "\n", "with graph.as_default(): # removed \" , tf.device('/cpu:0')\" so the ops can run on the GPU\n", "\n", "    train_dataset = tf.placeholder(tf.int32, shape=[batch_size, num_inputs])\n", "    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])\n", "\n", "    epochCount = tf.get_variable('epochCount', initializer=0) # stores the epoch count so the total number of epochs trained is known\n", "    update_epoch = tf.assign(epochCount, epochCount + 1)\n", "\n", "    embeddings = tf.get_variable('embeddings',\n", "        initializer=tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))\n", "\n", "    softmax_weights = tf.get_variable('softmax_weights',\n", "        initializer=tf.truncated_normal([vocabulary_size, embedding_size],\n", "                                        stddev=1.0 / math.sqrt(embedding_size)))\n", "\n", "    softmax_biases = tf.get_variable('softmax_biases',\n", "        initializer=tf.zeros([vocabulary_size]), trainable=False)\n", "\n", "    embed = tf.nn.embedding_lookup(embeddings, train_dataset) # looks up the embedding of every input paper ID in the batch\n", "\n", "    embed_reshaped = tf.reshape(embed, [batch_size * num_inputs, embedding_size])\n", "\n", "    segments = np.arange(batch_size).repeat(num_inputs) # [0,0,0,0, 1,1,1,1, ...] so each label's inputs share one segment\n", "\n", "    averaged_embeds = tf.segment_mean(embed_reshaped, segments) # CBOW-style average of the num_inputs embeddings per label\n", "\n", "    loss = tf.reduce_mean(\n", "        tf.nn.sampled_softmax_loss(weights=softmax_weights, biases=softmax_biases, inputs=averaged_embeds,\n", "                                   labels=train_labels, num_sampled=num_sampled, num_classes=vocabulary_size))\n", "\n", "    optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss) # Original learning rate was 1.0\n", "\n", "    saver = tf.train.Saver()" ], "execution_count": 0, "outputs": [] },
{ "metadata": { "id": "jcf12Y-4x8OL", "colab_type": "code", "colab": {} }, "cell_type": "code", "source": [ "def zipfolder(foldername, target_dir):\n", "    # Zip the contents of target_dir into foldername.zip, storing paths relative to target_dir\n", "    zipobj = zipfile.ZipFile(foldername + '.zip', 'w', zipfile.ZIP_DEFLATED)\n", "    rootlen = len(target_dir) + 1\n", "    for base, dirs, files in os.walk(target_dir):\n", "        for file in files:\n", "            fn = os.path.join(base, file)\n", "            zipobj.write(fn, fn[rootlen:])\n", "    zipobj.close() # close so the zip's central directory is written out" ], "execution_count": 0, "outputs": [] },
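{ "cell_type": "markdown", "metadata": {}, "source": [ "The next cell is an optional sanity check, not part of the original training flow: on a tiny made-up example it shows how `tf.segment_mean` collapses the `batch_size * num_inputs` looked-up embeddings back down to one averaged vector per label, which is what `averaged_embeds` computes in the graph cell above." ] },
{ "metadata": {}, "cell_type": "code", "source": [ "# Optional sanity check of the CBOW-style averaging used in the graph above.\n", "# The values are made up purely for illustration: with batch_size = 2 and\n", "# num_inputs = 3, the six looked-up embeddings are averaged down to two vectors.\n", "toy_embeds = tf.constant([[1., 1.], [3., 3.], [5., 5.],\n", "                          [0., 2.], [0., 4.], [0., 6.]])\n", "toy_segments = np.arange(2).repeat(3)  # [0 0 0 1 1 1]\n", "with tf.Session() as toy_sess:\n", "    print(toy_sess.run(tf.segment_mean(toy_embeds, toy_segments)))\n", "    # expected output: [[3. 3.] [0. 4.]]" ], "execution_count": 0, "outputs": [] },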
Type y or n: \") \n", "\n", "if loadModel == 'y':\n", " auth.authenticate_user()\n", " gauth = GoogleAuth()\n", " gauth.credentials = GoogleCredentials.get_application_default()\n", " drive = GoogleDrive(gauth)\n", " \n", " zip_id = input(\"Enter Gdrive file ID for tensorflow models: \") \n", "\n", " if not os.path.exists('checkpointsBook2VecCbowWindow1Downloaded'):\n", " os.makedirs('checkpointsBook2VecCbowWindow1Downloaded')\n", "\n", " # DOWNLOAD ZIP\n", " print (\"Downloading zip file\")\n", " myzip = drive.CreateFile({'id': zip_id})\n", " myzip.GetContentFile('model.zip')\n", "\n", " # UNZIP ZIP\n", " print (\"Uncompressing zip file\")\n", " zip_ref = zipfile.ZipFile('model.zip', 'r')\n", " zip_ref.extractall('checkpointsBook2VecCbowWindow1Downloaded/')\n", " zip_ref.close()\n", "\n", " print( os.getcwd() )\n", " print( os.listdir('./checkpointsBook2VecCbowWindow1Downloaded') )\n", " " ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "Would you like to load a checkpoint? Type y or n: y\n", "Enter Gdrive file ID for tensorflow models: 14sVkBYW8SG9Rg9pjOE4KdO-8bwthgnFM\n", "Downloading zip file\n", "Uncompressing zip file\n", "/content\n", "['checkpoint', 'Research2VecEmbedSize80.ckpt.data-00000-of-00001', 'Research2VecEmbedSize80.ckpt.meta', 'Research2VecEmbedSize80.ckpt.index']\n" ], "name": "stdout" } ] }, { "metadata": { "id": "jn9Zvv4PxtXV", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 1485 }, "outputId": "1d3527b2-7768-460b-8c2d-9e3e37b41c58" }, "cell_type": "code", "source": [ "num_steps = 10000000\n", "\n", "if 'loadModel' not in locals() and 'loadModel' not in globals():\n", " loadModel = 'n'\n", "\n", "uploadModel = drive.CreateFile() #used to upload checkpoints when graph is run\n", "\n", "with tf.Session(graph=graph) as session:\n", " \n", " if loadModel == 'y':\n", " saver.restore(session, './checkpointsBook2VecCbowWindow1Downloaded/Research2VecEmbedSize80.ckpt' )\n", " else: \n", " tf.global_variables_initializer().run() #Don't initalize variables after a checkpoint has been restored\n", " \n", " print('Initialized')\n", " average_loss = 0\n", " saveIteration = 1\n", " for step in range(1, num_steps):\n", " \n", " batch_data, batch_labels = generate_batch(\n", " batch_size, num_inputs)\n", " feed_dict = {train_dataset : batch_data, train_labels : batch_labels}\n", " _, l = session.run([optimizer, loss], feed_dict=feed_dict) \n", "\n", " average_loss += l\n", " if step % 8000 == 0:\n", " if step > 0:\n", " average_loss = average_loss / 8000\n", " print('Average loss at step %d: %f' % (step, average_loss))\n", " average_loss = 0\n", " \n", " if step % 50000 == 0:\n", " recEpoch_indexA = epoch_index - recEpoch_indexA #how much did the epoch_index since it was last checked\n", " for nE in range(0, recEpoch_indexA ):\n", " session.run(update_epoch) #session run calls tend to be huge bottlenecks, keep in mind while determining the frequency\n", " recEpoch_indexA = epoch_index\n", " print('recEpoch_indexA is', recEpoch_indexA)\n", " print( 'epochCount.eval() is ', epochCount.eval() )\n", " print('epoch_index is ' , epoch_index)\n", " \n", " save_path = saver.save(session, \"checkpointsBook2Vec5Inputs/Research2VecEmbedSize80.ckpt\") #Save checkpoint\n", " \n", " auth.authenticate_user()\n", " gauth = GoogleAuth() #Gdrive authenticion code placed here since it expires after some time\n", " gauth.credentials = GoogleCredentials.get_application_default()\n", " drive = GoogleDrive(gauth) \n", " uploadModel = 
{ "metadata": { "id": "jn9Zvv4PxtXV", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 1485 }, "outputId": "1d3527b2-7768-460b-8c2d-9e3e37b41c58" }, "cell_type": "code", "source": [ "num_steps = 10000000\n", "\n", "if 'loadModel' not in locals() and 'loadModel' not in globals():\n", "    loadModel = 'n'\n", "\n", "uploadModel = drive.CreateFile() # used to upload checkpoints while the graph is run\n", "\n", "with tf.Session(graph=graph) as session:\n", "\n", "    if loadModel == 'y':\n", "        saver.restore(session, './checkpointsBook2VecCbowWindow1Downloaded/Research2VecEmbedSize80.ckpt')\n", "    else:\n", "        tf.global_variables_initializer().run() # Don't initialize variables after a checkpoint has been restored\n", "\n", "    print('Initialized')\n", "    average_loss = 0\n", "    saveIteration = 1\n", "    for step in range(1, num_steps):\n", "\n", "        batch_data, batch_labels = generate_batch(\n", "            batch_size, num_inputs)\n", "        feed_dict = {train_dataset : batch_data, train_labels : batch_labels}\n", "        _, l = session.run([optimizer, loss], feed_dict=feed_dict)\n", "\n", "        average_loss += l\n", "        if step % 8000 == 0:\n", "            if step > 0:\n", "                average_loss = average_loss / 8000\n", "            print('Average loss at step %d: %f' % (step, average_loss))\n", "            average_loss = 0\n", "\n", "        if step % 50000 == 0:\n", "            recEpoch_indexA = epoch_index - recEpoch_indexA # how much the epoch_index increased since it was last checked\n", "            for nE in range(0, recEpoch_indexA):\n", "                session.run(update_epoch) # session.run calls tend to be big bottlenecks; keep that in mind when choosing this frequency\n", "            recEpoch_indexA = epoch_index\n", "            print('recEpoch_indexA is', recEpoch_indexA)\n", "            print('epochCount.eval() is ', epochCount.eval())\n", "            print('epoch_index is ', epoch_index)\n", "\n", "            save_path = saver.save(session, \"checkpointsBook2Vec5Inputs/Research2VecEmbedSize80.ckpt\") # Save checkpoint\n", "\n", "            auth.authenticate_user()\n", "            gauth = GoogleAuth() # Gdrive authentication code placed here since the credentials expire after some time\n", "            gauth.credentials = GoogleCredentials.get_application_default()\n", "            drive = GoogleDrive(gauth)\n", "            uploadModel = drive.CreateFile() # Need to also create the drive object with the updated authentication\n", "\n", "            chptName = 'Research2VecEmbedSize80'+str(saveIteration)\n", "            zipfolder(chptName, 'checkpointsBook2Vec5Inputs')\n", "            uploadModel.SetContentFile(chptName+\".zip\")\n", "            uploadModel.Upload()\n", "\n", "            print(\"Checkpoint uploaded to Google Drive\")\n", "            saveIteration += 1\n", "            os.remove(chptName+\".zip\") # Remove checkpoint zip file after upload" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "INFO:tensorflow:Restoring parameters from ./checkpointsBook2VecCbowWindow1Downloaded/Research2VecEmbedSize80.ckpt\n", "Initialized\n", "Completed 1 Epochs\n", "Completed 2 Epochs\n", "Completed 3 Epochs\n", "Completed 4 Epochs\n", "Completed 5 Epochs\n", "Completed 6 Epochs\n", "Completed 7 Epochs\n", "Completed 8 Epochs\n", "Completed 9 Epochs\n", "Completed 10 Epochs\n", "Completed 11 Epochs\n", "Average loss at step 8000: 0.015743\n", "Completed 12 Epochs\n", "Completed 13 Epochs\n", "Completed 14 Epochs\n", "Completed 15 Epochs\n", "Completed 16 Epochs\n", "Completed 17 Epochs\n", "Completed 18 Epochs\n", "Completed 19 Epochs\n", "Completed 20 Epochs\n", "Completed 21 Epochs\n", "Completed 22 Epochs\n", "Average loss at step 16000: 0.015661\n", "Completed 23 Epochs\n", "Completed 24 Epochs\n", "Completed 25 Epochs\n", "Completed 26 Epochs\n", "Completed 27 Epochs\n", "Completed 28 Epochs\n", "Completed 29 Epochs\n", "Completed 30 Epochs\n", "Completed 31 Epochs\n", "Completed 32 Epochs\n", "Completed 33 Epochs\n", "Average loss at step 24000: 0.015636\n", "Completed 34 Epochs\n", "Completed 35 Epochs\n", "Completed 36 Epochs\n", "Completed 37 Epochs\n", "Completed 38 Epochs\n", "Completed 39 Epochs\n", "Completed 40 Epochs\n", "Completed 41 Epochs\n", "Completed 42 Epochs\n", "Completed 43 Epochs\n", "Completed 44 Epochs\n", "Average loss at step 32000: 0.015627\n", "Completed 45 Epochs\n", "Completed 46 Epochs\n", "Completed 47 Epochs\n", "Completed 48 Epochs\n", "Completed 49 Epochs\n", "Completed 50 Epochs\n", "Completed 51 Epochs\n", "Completed 52 Epochs\n", "Completed 53 Epochs\n", "Completed 54 Epochs\n", "Completed 55 Epochs\n", "Average loss at step 40000: 0.015602\n", "Completed 56 Epochs\n", "Completed 57 Epochs\n", "Completed 58 Epochs\n", "Completed 59 Epochs\n", "Completed 60 Epochs\n", "Completed 61 Epochs\n", "Completed 62 Epochs\n", "Completed 63 Epochs\n", "Completed 64 Epochs\n", "Completed 65 Epochs\n", "Completed 66 Epochs\n", "Average loss at step 48000: 0.015633\n", "Completed 67 Epochs\n", "Completed 68 Epochs\n", "Completed 69 Epochs\n", "recEpoch_indexA is 69\n", "epochCount.eval() is 2012\n", "epoch_index is 69\n", "Checkpoint uploaded to Google Drive\n", "Completed 70 Epochs\n", "Completed 71 Epochs\n", "Completed 72 Epochs\n", "Completed 73 Epochs\n" ], "name": "stdout" } ] }
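, { "cell_type": "markdown", "metadata": {}, "source": [ "A hedged sketch (not part of the original notebook) of how the trained embeddings could be queried once training is stopped: restore the latest checkpoint, pull the `embeddings` matrix out of the graph, rank all papers by cosine similarity to one query paper, and look the top hits up in `bookDictionary`. `query_id` and `top_k` below are placeholder names chosen purely for illustration." ] },
{ "metadata": {}, "cell_type": "code", "source": [ "# Illustrative only: nearest-neighbour lookup with the trained embeddings.\n", "# Restores from the path the training loop above saves to.\n", "with tf.Session(graph=graph) as sim_session:\n", "    saver.restore(sim_session, 'checkpointsBook2Vec5Inputs/Research2VecEmbedSize80.ckpt')\n", "    emb = sim_session.run(embeddings)\n", "\n", "norms = np.linalg.norm(emb, axis=1, keepdims=True)\n", "unit_emb = emb / np.maximum(norms, 1e-8)  # normalise so a dot product is cosine similarity\n", "\n", "query_id = 3   # placeholder paper index\n", "top_k = 10\n", "sims = unit_emb @ unit_emb[query_id]\n", "nearest = np.argsort(-sims)[1:top_k + 1]  # skip the query paper itself\n", "for paper_id in nearest:\n", "    print(paper_id, sims[paper_id], bookDictionary.get(int(paper_id), ''))" ], "execution_count": 0, "outputs": [] } ] }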