{
 "metadata": {
  "name": ""
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Svmlight formatted file parsing with sklearn for Learning to Rank data"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import numpy as np\n",
      "\n",
      "from os.path import expanduser, join\n",
      "from sklearn.datasets import load_svmlight_file\n",
      "from sklearn.externals import joblib"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Cache parsed folds on disk (in the current directory) so that the slow\n",
      "# text parsing only happens once; cached arrays are memory-mapped read-only.\n",
      "memory = joblib.Memory(cachedir='.', mmap_mode='r')\n",
      "\n",
      "\n",
      "@memory.cache\n",
      "def load_fold(dataset, subset, fold_idx=1, dtype=np.float32):\n",
      "    \"\"\"Load one subset of one fold of a svmlight-formatted dataset.\n",
      "\n",
      "    The file is read from ``~/data/<dataset>/Fold<fold_idx>/<subset>.txt``.\n",
      "\n",
      "    Parameters\n",
      "    ----------\n",
      "    dataset : str\n",
      "        Name of the dataset folder under ``~/data``.\n",
      "    subset : str\n",
      "        File name without the ``.txt`` extension (e.g. 'train').\n",
      "    fold_idx : int, default 1\n",
      "        Index used to build the ``Fold<fold_idx>`` folder name.\n",
      "    dtype : numpy dtype, default np.float32\n",
      "        Passed through to ``load_svmlight_file``.\n",
      "\n",
      "    Returns\n",
      "    -------\n",
      "    X, y, qid\n",
      "        The feature matrix converted to a dense array with ``toarray()``,\n",
      "        followed by the targets and query ids as returned by\n",
      "        ``load_svmlight_file(..., query_id=True)``.\n",
      "    \"\"\"\n",
      "    DATA_FOLDER = expanduser('~/data')\n",
      "    filepath = join(DATA_FOLDER, dataset, 'Fold%d' % fold_idx, subset + '.txt')\n",
      "    X, y, qid = load_svmlight_file(filepath, dtype=dtype, query_id=True)\n",
      "    # Densify the sparse matrix returned by the parser before caching it.\n",
      "    return X.toarray(), y, qid"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "X_train, y_train, qid_train = load_fold('MSLR-WEB10K', 'train', fold_idx=1)\n",
      "X_vali, y_vali, qid_vali = load_fold('MSLR-WEB10K', 'vali', fold_idx=1)\n",
      "X_test, y_test, qid_test = load_fold('MSLR-WEB10K', 'test', fold_idx=1)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%time\n",
      "\n",
      "# Bundle the three subsets of fold 1 into a single compressed archive.\n",
      "np.savez_compressed(expanduser('~/data/MSLR-WEB10K/mslr_web10k_fold1.npz'),\n",
      "                    X_train=X_train, y_train=y_train, qid_train=qid_train,\n",
      "                    X_vali=X_vali, y_vali=y_vali, qid_vali=qid_vali,\n",
      "                    X_test=X_test, y_test=y_test, qid_test=qid_test)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}