{
 "metadata": {
  "name": "",
  "signature": "sha256:fc0c8f3b3173c4278e931fed74bc09733db862b0d10f7734fe797f7050c1d539"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import json\n",
      "import glob\n",
      "import collections\n",
      "import os\n",
      "\n",
      "# NOTE: this notebook is written for Python 2 (it relies on `xrange`).\n",
      "\n",
      "# Page ids 0 .. docs_number - 1 are probed; ids with no file on disk are skipped.\n",
      "docs_number = 30000\n",
      "\n",
      "# Each post is stored as a JSON file named by its page id.\n",
      "page_path_template = 'C:\\\\ML_Strijov\\\\Habr_classification\\\\habr_pages\\\\%d'\n",
      "\n",
      "# NOTE(review): the next cell reads authors.txt from the `habr_code` subfolder,\n",
      "# while this cell writes it one level above -- confirm which location is intended.\n",
      "path_to_authors = 'C:\\\\ML_Strijov\\\\Habr_classification\\\\authors.txt'\n",
      "\n",
      "# Author name -> numeric id, assigned from 1 in order of first appearance.\n",
      "authors = dict()\n",
      "author_id = 1\n",
      "\n",
      "# (page_id, author) for every page found on disk, kept in memory so that each\n",
      "# file is opened and parsed once rather than once per pass.\n",
      "page_authors = []\n",
      "\n",
      "for page_id in xrange(docs_number):\n",
      "    filename = page_path_template % page_id\n",
      "    if not os.path.exists(filename):\n",
      "        continue\n",
      "\n",
      "    with open(filename) as f:\n",
      "        post = json.load(f)\n",
      "\n",
      "    page_author = post['author']\n",
      "    page_authors.append((page_id, page_author))\n",
      "\n",
      "    # Test membership on the dict itself: in Python 2 `authors.keys()` builds a\n",
      "    # list, which made this check linear in the number of authors seen so far.\n",
      "    if page_author not in authors:\n",
      "        authors[page_author] = author_id\n",
      "        author_id += 1\n",
      "\n",
      "print('List of authors is formed')\n",
      "\n",
      "# One line per page, in page-id order: the author's id, a space, the page id.\n",
      "with open(path_to_authors, 'wb') as out:\n",
      "    for page_id, page_author in page_authors:\n",
      "        out.write('%d %d\\n' % (authors[page_author], page_id))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {"output_type": "stream", "stream": "stdout", "text": ["List of authors is formed\n"]}
     ],
     "prompt_number": 5
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import codecs\n",
      "import os\n",
      "import re\n",
      "import json\n",
      "import operator\n",
      "from collections import defaultdict\n",
      "\n",
      "# Number of topic weights read from every theta line.\n",
      "topic_count = 10\n",
      "# Recommendations are produced only for the first `max_authors` distinct authors.\n",
      "max_authors = 100\n",
      "# A per-topic score is multiplied by `score_scale` and written out only when it\n",
      "# is greater than `score_threshold`.\n",
      "score_scale = 100.0\n",
      "score_threshold = 1\n",
      "\n",
      "author_path = 'C:\\\\ML_Strijov\\\\Habr_classification\\\\habr_code\\\\authors.txt'\n",
      "theta_path = 'C:\\\\ML_Strijov\\\\Habr_classification\\\\habr_code\\\\theta.txt'\n",
      "author_topic_path = 'C:\\\\ML_Strijov\\\\Habr_classification\\\\habr_code\\\\author_topic.txt'\n",
      "mrec_data_path = 'C:\\\\ML_Strijov\\\\Habr_classification\\\\habr_code\\\\author_recomendations_2.tsv'\n",
      "\n",
      "\n",
      "def parse_theta_line(line):\n",
      "    '''Parse one theta line into (doc_id, tuple of float weights).\n",
      "\n",
      "    The first whitespace-separated field is the integer document id and the\n",
      "    remaining fields are floats. Returns None for a line with no fields.\n",
      "    '''\n",
      "    fields = line.split()\n",
      "    if not fields:\n",
      "        return None\n",
      "    return int(fields[0]), tuple(map(float, fields[1:]))\n",
      "\n",
      "\n",
      "# doc_id -> {author_id: weight}; every listed (author, document) pair gets weight 1.\n",
      "authors = defaultdict(defaultdict)\n",
      "defaultdict_float = lambda: defaultdict(float)\n",
      "# author_id -> {topic index: topic weight summed over the author's documents}.\n",
      "author_topic_distribution = defaultdict(defaultdict_float)\n",
      "author_doc_evaluation = defaultdict(defaultdict_float)\n",
      "# Distinct author ids in order of first appearance in the authors file.\n",
      "author_list = []\n",
      "seen_authors = set()\n",
      "\n",
      "\n",
      "with open(author_path, 'r') as authors_file:\n",
      "    for line in authors_file:\n",
      "        splitted = line.split()\n",
      "        if len(splitted) < 2:\n",
      "            # Blank or truncated line: nothing to record.\n",
      "            continue\n",
      "        author = int(splitted[0])\n",
      "        doc_id = int(splitted[1])\n",
      "        authors[doc_id][author] = 1\n",
      "        # Record each author once. Appending on every line repeated an author\n",
      "        # once per document, so the [:max_authors] slice below held duplicates\n",
      "        # and identical recommendation rows were written several times.\n",
      "        if author not in seen_authors:\n",
      "            seen_authors.add(author)\n",
      "            author_list.append(author)\n",
      "\n",
      "# First pass over theta: accumulate each author's topic weights.\n",
      "with open(theta_path, 'r') as theta:\n",
      "    for index, line in enumerate(theta):\n",
      "        if index > 0 and index % 1000 == 0:\n",
      "            print('Processed {0} documents'.format(index))\n",
      "\n",
      "        parsed = parse_theta_line(line)\n",
      "        if parsed is None:\n",
      "            continue\n",
      "        doc_id, distribution = parsed\n",
      "\n",
      "        # `.get` avoids inserting an empty entry into `authors` for every\n",
      "        # document that has no known author.\n",
      "        for author, weight in authors.get(doc_id, {}).items():\n",
      "            topic_weights = author_topic_distribution[author]\n",
      "            for topic in range(topic_count):\n",
      "                topic_weights[topic] += distribution[topic] * weight\n",
      "\n",
      "\n",
      "print('Mrec data counting...')\n",
      "\n",
      "selected_authors = author_list[:max_authors]\n",
      "\n",
      "# Second pass over theta: score every document against the selected authors and\n",
      "# write tab-separated rows of author id, document id and score.\n",
      "with open(mrec_data_path, 'w') as mrec_data_out:\n",
      "    with open(theta_path, 'r') as theta:\n",
      "        for index, line in enumerate(theta):\n",
      "            if index > 0 and index % 10000 == 0:\n",
      "                print('Processed {0} documents'.format(index))\n",
      "\n",
      "            parsed = parse_theta_line(line)\n",
      "            if parsed is None:\n",
      "                continue\n",
      "            doc_id, distribution = parsed\n",
      "\n",
      "            for author in selected_authors:\n",
      "                topic_weights = author_topic_distribution[author]\n",
      "                for topic in range(topic_count):\n",
      "                    evaluation = distribution[topic] * topic_weights[topic] * score_scale\n",
      "                    # NOTE(review): a row is written per topic that passes the\n",
      "                    # threshold, so one (author, document) pair can appear on\n",
      "                    # several rows -- confirm this is intended rather than a\n",
      "                    # single score summed over topics.\n",
      "                    if evaluation > score_threshold:\n",
      "                        row = (str(author), str(doc_id), str(evaluation))\n",
      "                        mrec_data_out.write('\\t'.join(row) + '\\n')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {"output_type": "stream", "stream": "stdout", "text": ["Processed 1000 documents\n", "Processed 2000 documents\n", "Processed 3000 documents"]},
      {"output_type": "stream", "stream": "stdout", "text": ["\n", "Processed 4000 documents\n", "Processed 5000 documents"]},
      {"output_type": "stream", "stream": "stdout", "text": ["\n", "Processed 6000 documents\n", "Processed 7000 documents"]},
      {"output_type": "stream", "stream": "stdout", "text": ["\n", "Processed 8000 documents\n", "Processed 9000 documents"]},
      {"output_type": "stream", "stream": "stdout", "text": ["\n", "Processed 10000 documents\n", "Processed 11000 documents"]},
      {"output_type": "stream", "stream": "stdout", "text": ["\n", "Processed 12000 documents\n", "Processed 13000 documents"]},
      {"output_type": "stream", "stream": "stdout", "text": ["\n", "Processed 14000 documents\n", "Processed 15000 documents"]},
      {"output_type": "stream", "stream": "stdout", "text": ["\n", "Processed 16000 documents\n", "Processed 17000 documents"]},
      {"output_type": "stream", "stream": "stdout", "text": ["\n", "Processed 18000 documents\n", "Processed 19000 documents"]},
      {"output_type": "stream", "stream": "stdout", "text": ["\n", "Processed 20000 documents\n", "Processed 21000 documents"]},
      {"output_type": "stream", "stream": "stdout", "text": ["\n", "Processed 22000 documents\n", "Processed 23000 documents"]},
      {"output_type": "stream", "stream": "stdout", "text": ["\n", "Processed 24000 documents"]},
      {"output_type": "stream", "stream": "stdout", "text": ["\n", "Processed 25000 documents\n", "Processed 26000 documents"]},
      {"output_type": "stream", "stream": "stdout", "text": ["\n", "Processed 27000 documents\n", "Processed 28000 documents"]},
      {"output_type": "stream", "stream": "stdout", "text": ["\n", "Processed 29000 documents\n", "Processed 30000 documents"]},
      {"output_type": "stream", "stream": "stdout", "text": ["\n", "Processed 31000 documents\n", "Processed 32000 documents"]},
      {"output_type": "stream", "stream": "stdout", "text": ["\n", "Processed 33000 documents\n", "Processed 34000 documents"]},
      {"output_type": "stream", "stream": "stdout", "text": ["\n", "Processed 35000 documents\n", "Processed 36000 documents\n", "Processed 37000 documents"]},
      {"output_type": "stream", "stream": "stdout", "text": ["\n", "Processed 38000 documents\n", "Processed 39000 documents"]},
      {"output_type": "stream", "stream": "stdout", "text": ["\n", "Processed 40000 documents\n", "Processed 41000 documents"]},
      {"output_type": "stream", "stream": "stdout", "text": ["\n", "Processed 42000 documents\n", "Processed 43000 documents"]},
      {"output_type": "stream", "stream": "stdout", "text": ["\n", "Processed 44000 documents\n", "Processed 45000 documents"]},
      {"output_type": "stream", "stream": "stdout", "text": ["\n", "Processed 46000 documents\n", "Processed 47000 documents\n", "Processed 48000 documents"]},
      {"output_type": "stream", "stream": "stdout", "text": ["\n", "Processed 49000 documents\n", "Processed 50000 documents"]},
      {"output_type": "stream", "stream": "stdout", "text": ["\n", "Mrec data counting...\n", "Processed 10000 documents"]},
      {"output_type": "stream", "stream": "stdout", "text": ["\n", "Processed 20000 documents"]},
      {"output_type": "stream", "stream": "stdout", "text": ["\n", "Processed 30000 documents"]},
      {"output_type": "stream", "stream": "stdout", "text": ["\n", "Processed 40000 documents"]},
      {"output_type": "stream", "stream": "stdout", "text": ["\n", "Processed 50000 documents"]},
      {"output_type": "stream", "stream": "stdout", "text": ["\n"]}
     ],
     "prompt_number": 8
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}