{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import h2o\n",
    "h2o.init()\n",
    "from h2o.estimators.word2vec import H2OWord2vecEstimator\n",
    "from h2o.estimators.gbm import H2OGradientBoostingEstimator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "job_titles_path = \"https://s3.amazonaws.com/h2o-public-test-data/smalldata/craigslistJobTitles.csv\"\n",
    "job_titles = h2o.import_file(job_titles_path, destination_frame = \"jobtitles\",\n",
    "                             col_names = [\"category\", \"jobtitle\"], col_types = [\"enum\", \"string\"], header = 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "STOP_WORDS = [\"ax\",\"i\",\"you\",\"edu\",\"s\",\"t\",\"m\",\"subject\",\"can\",\"lines\",\"re\",\"what\",\n",
    "               \"there\",\"all\",\"we\",\"one\",\"the\",\"a\",\"an\",\"of\",\"or\",\"in\",\"for\",\"by\",\"on\",\n",
    "               \"but\",\"is\",\"in\",\"a\",\"not\",\"with\",\"as\",\"was\",\"if\",\"they\",\"are\",\"this\",\"and\",\"it\",\"have\",\n",
    "               \"from\",\"at\",\"my\",\"be\",\"by\",\"not\",\"that\",\"to\",\"from\",\"com\",\"org\",\"like\",\"likes\",\"so\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def tokenize(sentences, stop_word = STOP_WORDS):\n",
    "    tokenized = sentences.tokenize(\"\\\\W+\")\n",
    "    tokenized_lower = tokenized.tolower()\n",
    "    tokenized_filtered = tokenized_lower[(tokenized_lower.nchar() >= 2) | (tokenized_lower.isna()),:]\n",
    "    tokenized_words = tokenized_filtered[tokenized_filtered.grep(\"[0-9]\",invert=True,output_logical=True),:]\n",
    "    tokenized_words = tokenized_words[(tokenized_words.isna()) | (~ tokenized_words.isin(STOP_WORDS)),:]\n",
    "    return tokenized_words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def predict(job_title,w2v, gbm):\n",
    "    words = tokenize(h2o.H2OFrame(job_title).ascharacter())\n",
    "    job_title_vec = w2v.transform(words, aggregate_method=\"AVERAGE\")\n",
    "    print(gbm.predict(test_data=job_title_vec))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "print(\"Break job titles into sequence of words\")\n",
    "words = tokenize(job_titles[\"jobtitle\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "print(\"Build word2vec model\")\n",
    "w2v_model = H2OWord2vecEstimator(sent_sample_rate = 0.0, epochs = 10)\n",
    "w2v_model.train(training_frame=words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "print(\"Sanity check - find synonyms for the word 'teacher'\")\n",
    "w2v_model.find_synonyms(\"teacher\", count = 5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "print(\"Calculate a vector for each job title\")\n",
    "job_title_vecs = w2v_model.transform(words, aggregate_method = \"AVERAGE\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "print(\"Prepare training&validation data (keep only job titles made of known words)\")\n",
    "valid_job_titles = ~ job_title_vecs[\"C1\"].isna()\n",
    "data = job_titles[valid_job_titles,:].cbind(job_title_vecs[valid_job_titles,:])\n",
    "data_split = data.split_frame(ratios=[0.8])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "print(\"Build a basic GBM model\")\n",
    "gbm_model = H2OGradientBoostingEstimator()\n",
    "gbm_model.train(x = job_title_vecs.names,\n",
    "                y=\"category\", \n",
    "                training_frame = data_split[0], \n",
    "                validation_frame = data_split[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "print(\"Predict!\")\n",
    "print(predict([\"school teacher having holidays every month\"], w2v_model, gbm_model))\n",
    "print(predict([\"developer with 3+ Java experience, jumping\"], w2v_model, gbm_model))\n",
    "print(predict([\"Financial accountant CPA preferred\"], w2v_model, gbm_model))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}