{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Classifying job titles with word2vec and GBM\n",
    "\n",
    "This notebook tokenizes the job titles in `craigslistJobTitles.csv`, trains an H2O word2vec model on the resulting words, averages the word vectors of each title, and trains a GBM on those vectors to predict the `category` column."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import h2o\n",
    "from h2o.estimators.gbm import H2OGradientBoostingEstimator\n",
    "from h2o.estimators.word2vec import H2OWord2vecEstimator\n",
    "\n",
    "h2o.init()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Seed for the train/validation split and the GBM so that re-runs are comparable.\n",
    "# The word2vec model below is not given a seed, so its vectors may still vary between runs.\n",
    "SEED = 42\n",
    "\n",
    "# Words removed from the tokenized job titles before training word2vec.\n",
    "STOP_WORDS = [\"ax\",\"i\",\"you\",\"edu\",\"s\",\"t\",\"m\",\"subject\",\"can\",\"lines\",\"re\",\"what\",\n",
    "              \"there\",\"all\",\"we\",\"one\",\"the\",\"a\",\"an\",\"of\",\"or\",\"in\",\"for\",\"by\",\"on\",\n",
    "              \"but\",\"is\",\"in\",\"a\",\"not\",\"with\",\"as\",\"was\",\"if\",\"they\",\"are\",\"this\",\"and\",\"it\",\"have\",\n",
    "              \"from\",\"at\",\"my\",\"be\",\"by\",\"not\",\"that\",\"to\",\"from\",\"com\",\"org\",\"like\",\"likes\",\"so\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "job_titles_path = \"https://s3.amazonaws.com/h2o-public-test-data/smalldata/craigslistJobTitles.csv\"\n",
    "job_titles = h2o.import_file(job_titles_path, destination_frame = \"jobtitles\",\n",
    "                             col_names = [\"category\", \"jobtitle\"], col_types = [\"enum\", \"string\"], header = 1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Helper functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def tokenize(sentences, stop_word = STOP_WORDS):\n",
    "    \"\"\"Split a string column into lower-cased, filtered words.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    sentences : H2OFrame\n",
    "        String column to split; rows are split on runs of non-word characters.\n",
    "    stop_word : list of str\n",
    "        Words to drop from the result. Defaults to ``STOP_WORDS``.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    H2OFrame\n",
    "        The tokens that are at least two characters long, contain no digit\n",
    "        and are not in ``stop_word``.\n",
    "    \"\"\"\n",
    "    tokenized = sentences.tokenize(\"\\\\W+\")\n",
    "    tokenized_lower = tokenized.tolower()\n",
    "    # NA rows are explicitly kept by the length and stop-word filters; presumably they are\n",
    "    # the sentence separators that word2vec relies on (see the H2O tokenize documentation).\n",
    "    tokenized_filtered = tokenized_lower[(tokenized_lower.nchar() >= 2) | (tokenized_lower.isna()),:]\n",
    "    tokenized_words = tokenized_filtered[tokenized_filtered.grep(\"[0-9]\",invert=True,output_logical=True),:]\n",
    "    # Filter on the stop_word argument (not the STOP_WORDS global) so callers can override it.\n",
    "    tokenized_words = tokenized_words[(tokenized_words.isna()) | (~ tokenized_words.isin(stop_word)),:]\n",
    "    return tokenized_words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def predict(job_title,w2v, gbm):\n",
    "    \"\"\"Predict with ``gbm`` on the averaged word vector of ``job_title``.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    job_title\n",
    "        Python data accepted by ``h2o.H2OFrame``; the calls in this notebook\n",
    "        pass a one-element list of strings.\n",
    "    w2v\n",
    "        Trained word2vec model used to transform the tokenized title.\n",
    "    gbm\n",
    "        Trained model whose ``predict`` is called on the transformed title.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    The result of ``gbm.predict`` for the averaged word vector.\n",
    "    \"\"\"\n",
    "    words = tokenize(h2o.H2OFrame(job_title).ascharacter())\n",
    "    job_title_vec = w2v.transform(words, aggregate_method=\"AVERAGE\")\n",
    "    # Return the prediction instead of printing it: the callers wrap this call in print(),\n",
    "    # which would otherwise print an extra None.\n",
    "    return gbm.predict(test_data=job_title_vec)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Tokenize the job titles and train word2vec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "print(\"Break job titles into sequence of words\")\n",
    "words = tokenize(job_titles[\"jobtitle\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "print(\"Build word2vec model\")\n",
    "w2v_model = H2OWord2vecEstimator(sent_sample_rate = 0.0, epochs = 10)\n",
    "w2v_model.train(training_frame=words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "print(\"Sanity check - find synonyms for the word 'teacher'\")\n",
    "w2v_model.find_synonyms(\"teacher\", count = 5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build the training data and train the GBM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "print(\"Calculate a vector for each job title\")\n",
    "job_title_vecs = w2v_model.transform(words, aggregate_method = \"AVERAGE\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "print(\"Prepare training&validation data (keep only job titles made of known words)\")\n",
    "valid_job_titles = ~ job_title_vecs[\"C1\"].isna()\n",
    "data = job_titles[valid_job_titles,:].cbind(job_title_vecs[valid_job_titles,:])\n",
    "# Seed the split so the same rows land in training/validation on every run.\n",
    "data_split = data.split_frame(ratios=[0.8], seed=SEED)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "print(\"Build a basic GBM model\")\n",
    "gbm_model = H2OGradientBoostingEstimator(seed=SEED)\n",
    "gbm_model.train(x = job_title_vecs.names,\n",
    "                y=\"category\", \n",
    "                training_frame = data_split[0], \n",
    "                validation_frame = data_split[1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Predict the category of new job titles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "print(\"Predict!\")\n",
    "print(predict([\"school teacher having holidays every month\"], w2v_model, gbm_model))\n",
    "print(predict([\"developer with 3+ Java experience, jumping\"], w2v_model, gbm_model))\n",
    "print(predict([\"Financial accountant CPA preferred\"], w2v_model, gbm_model))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}