{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# sklearn-LDA" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "代码示例:https://mp.weixin.qq.com/s/hMcJtB3Lss1NBalXRTGZlQ (玉树芝兰)
\n", "可视化:https://blog.csdn.net/qq_39496504/article/details/107125284
\n", "sklearn lda参数解读:https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html\n", "
中文版参数解读:https://blog.csdn.net/TiffanyRabbit/article/details/76445909\n", "
LDA原理-视频版:https://www.bilibili.com/video/BV1t54y127U8\n", "
LDA原理-文字版:https://www.jianshu.com/p/5c510694c07e\n", "
score的计算方法:https://github.com/scikit-learn/scikit-learn/blob/844b4be24d20fc42cc13b957374c718956a0db39/sklearn/decomposition/_lda.py#L729\n", "
主题困惑度1:https://blog.csdn.net/weixin_43343486/article/details/109255165\n", "
主题困惑度2:https://blog.csdn.net/weixin_39676021/article/details/112187210" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1.预处理" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [
"import os\n",
"import re\n",
"from functools import lru_cache\n",
"\n",
"import jieba\n",
"import jieba.posseg as psg\n",
"import pandas as pd"
] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "output_path = r'G:\\Desktop\\social\\BigHomework\\result'\n", "file_path = r'G:\\Desktop\\AI\\LDA\\weibo_data\\weibo_data_preprocessed.csv'\n", "data=pd.read_csv(file_path).astype(str)#content type\n", "stop_file = r\"G:\\Desktop\\AI\\LDA\\data\\stop_words.txt\"" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [
"# Chinese word segmentation with jieba; only nouns / verbal nouns are kept.\n",
"\n",
"@lru_cache(maxsize=None)\n",
"def _load_stopwords(path):\n",
"    \"\"\"Read the stop-word file at `path` once and return its lines as a frozenset.\n",
"\n",
"    Best effort: when the file cannot be opened, print a notice and return an\n",
"    empty set so that segmentation still runs. The result is cached per path;\n",
"    call `_load_stopwords.cache_clear()` after editing or creating the file.\n",
"    \"\"\"\n",
"    try:\n",
"        with open(path, encoding='utf-8') as handle:\n",
"            # strip only the line terminator; other whitespace is part of the entry\n",
"            return frozenset(line.rstrip('\\r\\n') for line in handle)\n",
"    except OSError:\n",
"        print(\"error in stop_file\")\n",
"        return frozenset()\n",
"\n",
"\n",
"def chinese_word_cut(mytext):\n",
"    \"\"\"Segment `mytext` with jieba and return the kept words joined by single spaces.\n",
"\n",
"    A token is kept when, after every non-Chinese character is removed, it has\n",
"    at least two characters, is not listed in the stop-word file `stop_file`,\n",
"    and its jieba part-of-speech flag is one of 'n', 'nz', 'vn'.\n",
"    \"\"\"\n",
"    stop_words = _load_stopwords(stop_file)\n",
"    flag_list = ('n', 'nz', 'vn')\n",
"    word_list = []\n",
"    for seg_word in psg.cut(mytext):\n",
"        # keep Chinese characters only\n",
"        word = re.sub(u'[^\\u4e00-\\u9fa5]', '', seg_word.word)\n",
"        # The length test comes first so that it still applies when the\n",
"        # stop-word set is empty (e.g. the stop-word file is missing).\n",
"        if len(word) < 2 or word in stop_words:\n",
"            continue\n",
"        if seg_word.flag in flag_list:\n",
"            word_list.append(word)\n",
"    return \" \".join(word_list)"
] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
博主昵称微博认证content发布时间转发评论
0辣子鸡谁做的好吃nan好像可以在自己幻想的元宇宙里过一辈子好像已经过完了一辈子双鱼座的脑子要不得01月10日 23:59000
1远古的刀nan反正闭关锁宇宙我们将会面临下一次的闭关锁国融入不了全球经济王峻涛6688跟你们讲我相信这个元...01月10日 23:583230
2暮景烟_深浅nan周深先生之夜元宇宙周深拥有了生米就像拥有了梦的翅膀卡布叻_周深放心飞吧生米永相随cp时尚先生...01月10日 23:58000
3东辉毅恒传媒nan王峻涛6688其实吧你有空可以再看看这个视频跟你们讲我相信这个元宇宙真的会来虽然不是一下子就...01月10日 23:57000
4在寒蝉鸣泣中等待夏日重现nan敬元宇宙L让基尔希斯坦的女朋友的微博视频01月10日 23:57005
\n", "
" ], "text/plain": [ " 博主昵称 微博认证 content \\\n", "0 辣子鸡谁做的好吃 nan 好像可以在自己幻想的元宇宙里过一辈子好像已经过完了一辈子双鱼座的脑子要不得 \n", "1 远古的刀 nan 反正闭关锁宇宙我们将会面临下一次的闭关锁国融入不了全球经济王峻涛6688跟你们讲我相信这个元... \n", "2 暮景烟_深浅 nan 周深先生之夜元宇宙周深拥有了生米就像拥有了梦的翅膀卡布叻_周深放心飞吧生米永相随cp时尚先生... \n", "3 东辉毅恒传媒 nan 王峻涛6688其实吧你有空可以再看看这个视频跟你们讲我相信这个元宇宙真的会来虽然不是一下子就... \n", "4 在寒蝉鸣泣中等待夏日重现 nan 敬元宇宙L让基尔希斯坦的女朋友的微博视频 \n", "\n", " 发布时间 转发 评论 赞 \n", "0 01月10日 23:59 0 0 0 \n", "1 01月10日 23:58 32 3 0 \n", "2 01月10日 23:58 0 0 0 \n", "3 01月10日 23:57 0 0 0 \n", "4 01月10日 23:57 0 0 5 " ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.head()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [
"data[\"content\"] = data.content.apply(chinese_word_cut)\n",
"# index=False: otherwise the row index is written too and comes back as an\n",
"# extra 'Unnamed: 0' column when the file is read again.\n",
"data.to_csv('weibi_data_keywords.csv', index=False)"
] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "ename": "NameError", "evalue": "name 'data' is not defined", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[1;32mg:\\Desktop\\AI\\LDA\\LDAsklearn_origin.ipynb Cell 9'\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[0m data\u001b[39m.\u001b[39mhead()\n", "\u001b[1;31mNameError\u001b[0m: name 'data' is not defined" ] } ], "source": [ "data.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2.LDA分析" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [
"from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n",
"from sklearn.decomposition import LatentDirichletAllocation\n",
"import os\n",
"import pandas as pd\n",
"import re\n",
"import jieba\n",
"import jieba.posseg as psg\n",
"\n",
"# Posts whose segmentation kept no word are stored as empty fields, which\n",
"# read_csv parses as NaN. Blank them before astype(str) so they do not turn\n",
"# into the literal token 'nan' in the vectorizer vocabulary.\n",
"data = (\n",
"    pd.read_csv(r'G:\\Desktop\\AI\\LDA\\weibi_data_keywords.csv')\n",
"    .fillna({'content': ''})\n",
"    .astype(str)\n",
")"
] }, { "cell_type": "code", "execution_count":
5, "metadata": {}, "outputs": [], "source": [
"def print_top_words(model, feature_names, n_top_words):\n",
"    \"\"\"Print and return the `n_top_words` highest-weighted words of every topic.\n",
"\n",
"    model: fitted model exposing `components_` (one row of word weights per topic).\n",
"    feature_names: sequence mapping a column index of `components_` to its word.\n",
"    Returns a list holding one space-joined string of words per topic.\n",
"    \"\"\"\n",
"    tword = []\n",
"    for topic_idx, topic in enumerate(model.components_):\n",
"        print(\"Topic #%d:\" % topic_idx)\n",
"        # argsort is ascending, so the slice walks it backwards: largest weights first\n",
"        topic_w = \" \".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]])\n",
"        tword.append(topic_w)\n",
"        print(topic_w)\n",
"    return tword"
] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [
"n_features = 1000  # extract 1000 feature words\n",
"tf_vectorizer = CountVectorizer(strip_accents = 'unicode',\n",
"                                max_features=n_features,\n",
"                                stop_words='english',\n",
"                                max_df = 0.5,\n",
"                                min_df = 10)\n",
"tf = tf_vectorizer.fit_transform(data.content)"
] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [
"n_features = 1000\n",
"tf_vectorizer1 = TfidfVectorizer(strip_accents = 'unicode',\n",
"                                 max_features=n_features,\n",
"                                 stop_words='english',\n",
"                                 max_df = 0.5,\n",
"                                 min_df = 10)\n",
"tf1 = tf_vectorizer1.fit_transform(data.content)"
] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "LatentDirichletAllocation(learning_offset=50, max_iter=100, n_components=5,\n", " random_state=0)" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [
"n_topics = 5\n",
"lda1 = LatentDirichletAllocation(n_components=n_topics, max_iter=100,\n",
"                                 learning_method='batch',\n",
"                                 learning_offset=50,\n",
"#                                 doc_topic_prior=0.1,\n",
"#                                 topic_word_prior=0.01,\n",
"                                 random_state=0)\n",
"lda1.fit(tf1)"
] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "ename": "NameError", "evalue": "name 'lda1' is not defined", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32mg:\\Desktop\\AI\\LDA\\LDAsklearn_origin.ipynb Cell 16'\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[0m lda1\n", "\u001b[1;31mNameError\u001b[0m: name 'lda1' is not defined" ] } ], "source": [ "lda1" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "LatentDirichletAllocation(learning_offset=50, max_iter=50, n_components=8,\n", " random_state=0)" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [
"n_topics = 8\n",
"lda = LatentDirichletAllocation(n_components=n_topics, max_iter=50,\n",
"                                learning_method='batch',\n",
"                                learning_offset=50,\n",
"#                                doc_topic_prior=0.1,\n",
"#                                topic_word_prior=0.01,\n",
"                                random_state=0)\n",
"lda.fit(tf)"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2.1输出每个主题对应词语 " ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "ename": "NameError", "evalue": "name 'tf_vectorizer' is not defined", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[1;32mg:\\Desktop\\AI\\LDA\\LDAsklearn_origin.ipynb Cell 19'\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m n_top_words \u001b[39m=\u001b[39m \u001b[39m25\u001b[39m\n\u001b[1;32m----> 2\u001b[0m tf_feature_names \u001b[39m=\u001b[39m tf_vectorizer\u001b[39m.\u001b[39mget_feature_names()\n\u001b[0;32m 3\u001b[0m topic_word \u001b[39m=\u001b[39m print_top_words(lda, tf_feature_names, n_top_words)\n\u001b[0;32m 4\u001b[0m tf_feature_names1 \u001b[39m=\u001b[39m tf_vectorizer1\u001b[39m.\u001b[39mget_feature_names()\n", "\u001b[1;31mNameError\u001b[0m: name 'tf_vectorizer' is not defined" ] } ], "source": [
"n_top_words = 25\n",
"# get_feature_names() is deprecated since scikit-learn 1.0 and removed in 1.2\n",
"tf_feature_names = tf_vectorizer.get_feature_names_out()\n",
"topic_word = print_top_words(lda,
tf_feature_names, n_top_words)\n" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Topic #0:\n", "世界 数字 藏品 人类 艺术 回家 视频 现实 生活 星球 时代 虚拟世界 老师 电影 时间 作品 图片 感觉 代言人 空间 科幻 科技 体验 玩家 概念\n", "Topic #1:\n", "区块 游戏 手游 项目 社区 以太 货币 币圈 链元 链接 网页 链游 合约 海盗 财经 狗狗 上线 交易 视频 数字 土地 计划 生态 代币 行情\n", "Topic #2:\n", "板块 股份 市场 个股 资金 概念 指数 股票 龙头 机会 行情 新能源 方向 题材 科技 大盘 调整 文章 趋势 医药 热点 预期 走势 股市 创业板\n", "Topic #3:\n", "链接 网页 时尚 抽奖 话题 平台 粉丝 感觉 概念 官方 新品 评论 舞台 和元 朋友 发布会 体验 玩法 商标 音乐 直播 星辰 小伙伴 女王 口罩\n", "Topic #4:\n", "公司 视频 技术 科技 发展 互联网 游戏 概念 数字 产品 投资 产业 领域 行业 经济 企业 平台 社交 世界 全球 腾讯 新闻 布局 现实 内容\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\31897\\anaconda3\\lib\\site-packages\\sklearn\\utils\\deprecation.py:87: FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.\n", " warnings.warn(msg, category=FutureWarning)\n" ] } ], "source": [
"n_top_words = 25\n",
"# get_feature_names() is deprecated since scikit-learn 1.0 and removed in 1.2\n",
"tf_feature_names1 = tf_vectorizer1.get_feature_names_out()\n",
"topic_word = print_top_words(lda1, tf_feature_names1, n_top_words)"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2.2输出每篇文章对应主题 " ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "import numpy as np" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "topics=lda1.transform(tf1)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "ename": "NameError", "evalue": "name 'lda' is not defined", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[1;32mg:\\Desktop\\AI\\LDA\\LDAsklearn_origin.ipynb Cell 24'\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[0m
lda\n", "\u001b[1;31mNameError\u001b[0m: name 'lda' is not defined" ] } ], "source": [ "lda" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [
"# Label every document with the index of its highest-probability topic\n",
"# (argmax returns the first index on ties) and keep the full per-topic row.\n",
"topic = [\"Topic #\" + str(best) for best in topics.argmax(axis=1)]\n",
"data['概率最大的主题序号'] = topic\n",
"data['每个主题对应概率'] = list(topics)\n",
"data.to_excel(\"data_topic_tfidf.xlsx\", index=False)"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2.3可视化 " ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "import pyLDAvis\n", "import pyLDAvis.sklearn" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\31897\\anaconda3\\lib\\site-packages\\sklearn\\utils\\deprecation.py:87: FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.\n", " warnings.warn(msg, category=FutureWarning)\n", "c:\\Users\\31897\\anaconda3\\lib\\site-packages\\pyLDAvis\\_prepare.py:246: FutureWarning: In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only.\n", " default_term_info = default_term_info.sort_values(\n" ] }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", "
\n", "" ], "text/plain": [ "" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [
"pyLDAvis.enable_notebook()\n",
"pic = pyLDAvis.sklearn.prepare(lda1, tf1, tf_vectorizer1)\n",
"# Name the file after the visualised model's own topic count: the global\n",
"# n_topics is reassigned by the count-based `lda` cell and may not match lda1.\n",
"pyLDAvis.save_html(pic, 'lda_pass' + str(lda1.n_components) + '.html')\n",
"# The saved HTML file is written to the working directory.\n",
"# Unlike what the tutorial video describes, this code finishes quickly and\n",
"# does not need to be interrupted by hand.\n",
"pyLDAvis.display(pic)"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2.4困惑度 " ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1\n", "2\n", "3\n", "4\n", "5\n", "6\n", "7\n", "8\n", "9\n", "10\n", "11\n", "12\n", "13\n", "14\n", "15\n" ] } ], "source": [
"# Fit one model per topic count 1 .. n_max_topics-1 and record its perplexity\n",
"# and score on the same matrix it was fitted on.\n",
"plexs = []\n",
"scores = []\n",
"n_max_topics = 16\n",
"for i in range(1,n_max_topics):\n",
"    print(i)\n",
"    lda2 = LatentDirichletAllocation(n_components=i, max_iter=50,\n",
"                                     learning_method='batch',\n",
"                                     learning_offset=50,random_state=0)\n",
"    lda2.fit(tf1)\n",
"    plexs.append(lda2.perplexity(tf1))\n",
"    scores.append(lda2.score(tf1))"
] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [
"# Right end of the plotted range. plexs holds one value per fitted model,\n",
"# i.e. n_max_topics - 1 values, so clamp to its length to keep x and y aligned.\n",
"n_t = min(15, len(plexs))\n",
"x = list(range(1, n_t + 1))\n",
"plt.plot(x, plexs[:n_t])\n",
"plt.xlabel(\"number of topics\")\n",
"plt.ylabel(\"perplexity\")\n",
"plt.show()"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "interpreter": { "hash": "8ad04bb091a50ccdbc721be5657843c2a9a7fbc3dc3147e11dd611d30683dae8" }, "kernelspec": { "display_name": "Python 3.8.3 ('base')", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.3" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": { "height": "303.324px", "left": "114px", "top": "110.322px", "width": "165px" }, "toc_section_display": true, "toc_window_display": true } }, "nbformat": 4, "nbformat_minor": 2 }