{ "cells": [ { "cell_type": "markdown", "source": [ "# 透過機器學習預測股市漲跌_系列2_進階資料處理\n", "\n", "作者:劉睿哲(臺灣行銷研究特邀作者)、蔡尚宏(臺灣行銷研究特邀作者)、鄭晴文(臺灣行銷研究特邀作者)\n", "\n", "[原始資料請見此](https://drive.google.com/file/d/15lnX0-B1SzXrSXEIVNMRfiA5bfK-Kfbq/view?usp=sharing),下載下來與本ipynb檔案放於同一個資料夾中,再執行下方程式即可。" ], "metadata": {} }, { "cell_type": "code", "execution_count": 28, "source": [ "import os\n", "import re\n", "import nltk\n", "import string\n", "import pandas as pd\n", "import numpy as np\n", "from nltk.corpus import stopwords\n", "from nltk.tokenize import word_tokenize\n", "from nltk.stem.wordnet import WordNetLemmatizer\n", "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n", "from datetime import timedelta, datetime\n" ], "outputs": [], "metadata": {} }, { "cell_type": "code", "execution_count": 29, "source": [ "nltk.download('punkt')\n", "nltk.download('stopwords')\n", "nltk.download('wordnet')" ], "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "[nltk_data] Downloading package punkt to /Users/liuruizhe/nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n", "[nltk_data] Downloading package stopwords to\n", "[nltk_data] /Users/liuruizhe/nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n", "[nltk_data] Downloading package wordnet to\n", "[nltk_data] /Users/liuruizhe/nltk_data...\n", "[nltk_data] Package wordnet is already up-to-date!\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "True" ] }, "metadata": {}, "execution_count": 29 } ], "metadata": {} }, { "cell_type": "code", "execution_count": 47, "source": [ "data = pd.read_csv('/Users/liuruizhe/Library/Mobile Documents/com~apple~CloudDocs/TMR/Kaggle_DJIA/DJIA/after_Combined_News_DJIA.csv')\n", "data" ], "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0DateLabelTop1Top2Top3Top4Top5Top6Top7...Top19Top20Top21Top22Top23Top24Top25Unnamed: 28Unnamed: 29fluctuation
002008/8/80b georgia downs two russian warplanes as cou...b breaking musharraf to be impeachedb russia today columns of troops roll into so...b russian tanks are moving towards the capital...b afghan children raped with impunity u n o...b russian tanks have entered south ossetia wh...b breaking georgia invades south ossetia rus......b this is a busy day the european union has ...b georgia will withdraw soldiers from iraq t...b why the pentagon thinks attacking iran is a ...b caucasus in crisis georgia invades south os...b indian shoe manufactory and again in a se...b visitors suffering from mental illnesses ban...b no help for mexico s kidnapping surgeNaNNaN0.055658
112008/8/111b why wont america and nato help us if they w...b bush puts foot down on georgian conflictb jewish georgian minister thanks to israeli ...b georgian army flees in disarray as russians ...b olympic opening ceremony fireworks fakedb what were the mossad with fraudulent new zea...b russia angered by israeli military sale to g......b china to overtake us as largest manufacturerb war in south ossetia picsb israeli physicians group condemns state tort...b russia has just beaten the united states ov...b perhaps the question about the georgia r...b russia is so much better at warb so this is what it s come to trading sex fo...NaNNaN-1.793642
222008/8/120b remember that adorable year old who sang at...b russia ends georgia operationb if we had no sexual harassment we would have...b al qa eda is losing support in iraq because ...b ceasefire in georgia putin outmaneuvers the...b why microsoft and intel tried to kill the xo...b stratfor the russo georgian war and the bal......b russia georgia and nato cold war twob remember that adorable year old who led you...b war in georgia the israeli connectionb all signs point to the us encouraging georgi...b christopher king argues that the us and nato...b america the new mexicob bbc news asia pacific extinction by man...NaNNaN-0.757185
332008/8/130b u s refuses israel weapons to attack iran ...b when the president ordered to attack tskhinv...b israel clears troops who killed reuters cam...b britain s policy of being tough on drugs is ...b body of year old found in trunk latest r...b china has moved million quake survivors ...b bush announces operation get all up in russi......b russian convoy heads into georgia violating...b israeli defence minister us against strike ...b gorbachev we had no choiceb witness russian forces head towards tbilisi...b quarter of russians blame u s for conflict...b georgian president says us military will ta...b nobel laureate aleksander solzhenitsyn accu...NaNNaN1.207153
442008/8/141b all the experts admit that we should legalis...b war in south osetia pictures made by a r...b swedish wrestler ara abrahamian throws away ...b russia exaggerated the death toll in south o...b missile that killed inside pakistan may ha...b rushdie condemns random house s refusal to p...b poland and us agree to missle defense deal ......b non media photos of south ossetia georgia c...b georgian tv reporter shot by russian sniper ...b saudi arabia mother moves to block child ma...b taliban wages war on humanitarian aid workersb russia world can forget about georgia s ...b darfur rebels accuse sudan of mounting major...b philippines peace advocate say muslims nee...NaNNaN1.795697
..................................................................
198419842016/6/270barclays and rbs shares suspended from trading...pope says church should ask forgiveness from g...poland shocked by xenophobic abuse of poles ...there will be no second referendum cabinet ag...scotland welcome to join eu merkel ally sayssterling dips below friday s year low amid br...no negative news about south african president......turkey sorry for downing russian jetedward snowden lawyer vows new push for pardon...brexit opinion poll reveals majority don t wan...conservative mp leave campaigner the leave c...economists predict UNITED KINGDOM recession f...new eu superstate plan by france germany cr...pakistani clerics declare transgender marriage...NaNNaNNaN
198519852016/6/281scientists to australia if you want to save...the personal details of french police office...s amp p cuts united kingdom sovereign credit r...huge helium deposit found in africaceo of the south african state broadcaster qui...brexit cost investors trillion the worst on...hong kong democracy activists call for return ......YEAR old skull from borneo reveals surprise f...palestinians stone western wall worshipers po...jean claude juncker asks farage why are you h...romanians for remainians offering a new home...brexit gibraltar in talks with scotland to st...suicide bombers strike lebanonmexico s security forces routinely use sexual...NaNNaNNaN
198619862016/6/291explosion at airport in istanbulyemeni former president terrorism is the offs...UNITED KINGDOM must accept freedom of movement...devastated scientists too late to captive bre...british labor party leader jeremy corbyn loses...a muslim shop in the UNITED KINGDOM was just f...mexican authorities sexually torture women in ......emaciated lions in taiz zoo are trapped in blo...rupert murdoch describes brexit as wonderful ...more than killed in yemen suicide attacksgoogle found disastrous symantec and norton vu...extremist violence on the rise in germany dom...bbc news labour mps pass corbyn no confidence...tiny new zealand town with too many jobs lau...NaNNaNNaN
198719872016/6/301jamaica proposes marijuana dispensers for tour...stephen hawking says pollution and stupidity ...boris johnson says he will not run for tory pa...six gay men in ivory coast were abused and for...switzerland denies citizenship to muslim immig...palestinian terrorist stabs israeli teen girl ...puerto rico will default on billion of debt ......calls to suspend saudi arabia from un human ri...more than nobel laureates call out greenpeac...british pedophile sentenced to years in us f...us permitted offshore fracks in gulf of mexi...we will be swimming in ridicule french beach...uefa says no minutes of silence for istanbul v...law enforcement sources gun used in paris ter...NaNNaNNaN
198819882016/7/11a year old woman in mexico city finally recei...imf chief backs athens as permanent olympic hostthe president of france says if brexit won so...british man who must give police hours noti...nobel laureates urge greenpeace to stop oppo...brazil huge spike in number of police killing...austria s highest court annuls presidential el......u s sailors detained by iran spoke too much u...mass fish kill in vietnam solved as taiwan ste...philippines president rodrigo duterte urges pe...spain arrests three pakistanis accused of prom...venezuela where anger over food shortages is ...a hindu temple worker has been killed by three...ozone layer hole seems to be healing us amp...NaNNaNNaN
\n", "

1989 rows × 31 columns

\n", "
" ], "text/plain": [ " Unnamed: 0 Date Label \\\n", "0 0 2008/8/8 0 \n", "1 1 2008/8/11 1 \n", "2 2 2008/8/12 0 \n", "3 3 2008/8/13 0 \n", "4 4 2008/8/14 1 \n", "... ... ... ... \n", "1984 1984 2016/6/27 0 \n", "1985 1985 2016/6/28 1 \n", "1986 1986 2016/6/29 1 \n", "1987 1987 2016/6/30 1 \n", "1988 1988 2016/7/1 1 \n", "\n", " Top1 \\\n", "0 b georgia downs two russian warplanes as cou... \n", "1 b why wont america and nato help us if they w... \n", "2 b remember that adorable year old who sang at... \n", "3 b u s refuses israel weapons to attack iran ... \n", "4 b all the experts admit that we should legalis... \n", "... ... \n", "1984 barclays and rbs shares suspended from trading... \n", "1985 scientists to australia if you want to save... \n", "1986 explosion at airport in istanbul \n", "1987 jamaica proposes marijuana dispensers for tour... \n", "1988 a year old woman in mexico city finally recei... \n", "\n", " Top2 \\\n", "0 b breaking musharraf to be impeached \n", "1 b bush puts foot down on georgian conflict \n", "2 b russia ends georgia operation \n", "3 b when the president ordered to attack tskhinv... \n", "4 b war in south osetia pictures made by a r... \n", "... ... \n", "1984 pope says church should ask forgiveness from g... \n", "1985 the personal details of french police office... \n", "1986 yemeni former president terrorism is the offs... \n", "1987 stephen hawking says pollution and stupidity ... \n", "1988 imf chief backs athens as permanent olympic host \n", "\n", " Top3 \\\n", "0 b russia today columns of troops roll into so... \n", "1 b jewish georgian minister thanks to israeli ... \n", "2 b if we had no sexual harassment we would have... \n", "3 b israel clears troops who killed reuters cam... \n", "4 b swedish wrestler ara abrahamian throws away ... \n", "... ... \n", "1984 poland shocked by xenophobic abuse of poles ... \n", "1985 s amp p cuts united kingdom sovereign credit r... \n", "1986 UNITED KINGDOM must accept freedom of movement... \n", "1987 boris johnson says he will not run for tory pa... \n", "1988 the president of france says if brexit won so... \n", "\n", " Top4 \\\n", "0 b russian tanks are moving towards the capital... \n", "1 b georgian army flees in disarray as russians ... \n", "2 b al qa eda is losing support in iraq because ... \n", "3 b britain s policy of being tough on drugs is ... \n", "4 b russia exaggerated the death toll in south o... \n", "... ... \n", "1984 there will be no second referendum cabinet ag... \n", "1985 huge helium deposit found in africa \n", "1986 devastated scientists too late to captive bre... \n", "1987 six gay men in ivory coast were abused and for... \n", "1988 british man who must give police hours noti... \n", "\n", " Top5 \\\n", "0 b afghan children raped with impunity u n o... \n", "1 b olympic opening ceremony fireworks faked \n", "2 b ceasefire in georgia putin outmaneuvers the... \n", "3 b body of year old found in trunk latest r... \n", "4 b missile that killed inside pakistan may ha... \n", "... ... \n", "1984 scotland welcome to join eu merkel ally says \n", "1985 ceo of the south african state broadcaster qui... \n", "1986 british labor party leader jeremy corbyn loses... \n", "1987 switzerland denies citizenship to muslim immig... \n", "1988 nobel laureates urge greenpeace to stop oppo... \n", "\n", " Top6 \\\n", "0 b russian tanks have entered south ossetia wh... \n", "1 b what were the mossad with fraudulent new zea... \n", "2 b why microsoft and intel tried to kill the xo... \n", "3 b china has moved million quake survivors ... \n", "4 b rushdie condemns random house s refusal to p... \n", "... ... \n", "1984 sterling dips below friday s year low amid br... \n", "1985 brexit cost investors trillion the worst on... \n", "1986 a muslim shop in the UNITED KINGDOM was just f... \n", "1987 palestinian terrorist stabs israeli teen girl ... \n", "1988 brazil huge spike in number of police killing... \n", "\n", " Top7 ... \\\n", "0 b breaking georgia invades south ossetia rus... ... \n", "1 b russia angered by israeli military sale to g... ... \n", "2 b stratfor the russo georgian war and the bal... ... \n", "3 b bush announces operation get all up in russi... ... \n", "4 b poland and us agree to missle defense deal ... ... \n", "... ... ... \n", "1984 no negative news about south african president... ... \n", "1985 hong kong democracy activists call for return ... ... \n", "1986 mexican authorities sexually torture women in ... ... \n", "1987 puerto rico will default on billion of debt ... ... \n", "1988 austria s highest court annuls presidential el... ... \n", "\n", " Top19 \\\n", "0 b this is a busy day the european union has ... \n", "1 b china to overtake us as largest manufacturer \n", "2 b russia georgia and nato cold war two \n", "3 b russian convoy heads into georgia violating... \n", "4 b non media photos of south ossetia georgia c... \n", "... ... \n", "1984 turkey sorry for downing russian jet \n", "1985 YEAR old skull from borneo reveals surprise f... \n", "1986 emaciated lions in taiz zoo are trapped in blo... \n", "1987 calls to suspend saudi arabia from un human ri... \n", "1988 u s sailors detained by iran spoke too much u... \n", "\n", " Top20 \\\n", "0 b georgia will withdraw soldiers from iraq t... \n", "1 b war in south ossetia pics \n", "2 b remember that adorable year old who led you... \n", "3 b israeli defence minister us against strike ... \n", "4 b georgian tv reporter shot by russian sniper ... \n", "... ... \n", "1984 edward snowden lawyer vows new push for pardon... \n", "1985 palestinians stone western wall worshipers po... \n", "1986 rupert murdoch describes brexit as wonderful ... \n", "1987 more than nobel laureates call out greenpeac... \n", "1988 mass fish kill in vietnam solved as taiwan ste... \n", "\n", " Top21 \\\n", "0 b why the pentagon thinks attacking iran is a ... \n", "1 b israeli physicians group condemns state tort... \n", "2 b war in georgia the israeli connection \n", "3 b gorbachev we had no choice \n", "4 b saudi arabia mother moves to block child ma... \n", "... ... \n", "1984 brexit opinion poll reveals majority don t wan... \n", "1985 jean claude juncker asks farage why are you h... \n", "1986 more than killed in yemen suicide attacks \n", "1987 british pedophile sentenced to years in us f... \n", "1988 philippines president rodrigo duterte urges pe... \n", "\n", " Top22 \\\n", "0 b caucasus in crisis georgia invades south os... \n", "1 b russia has just beaten the united states ov... \n", "2 b all signs point to the us encouraging georgi... \n", "3 b witness russian forces head towards tbilisi... \n", "4 b taliban wages war on humanitarian aid workers \n", "... ... \n", "1984 conservative mp leave campaigner the leave c... \n", "1985 romanians for remainians offering a new home... \n", "1986 google found disastrous symantec and norton vu... \n", "1987 us permitted offshore fracks in gulf of mexi... \n", "1988 spain arrests three pakistanis accused of prom... \n", "\n", " Top23 \\\n", "0 b indian shoe manufactory and again in a se... \n", "1 b perhaps the question about the georgia r... \n", "2 b christopher king argues that the us and nato... \n", "3 b quarter of russians blame u s for conflict... \n", "4 b russia world can forget about georgia s ... \n", "... ... \n", "1984 economists predict UNITED KINGDOM recession f... \n", "1985 brexit gibraltar in talks with scotland to st... \n", "1986 extremist violence on the rise in germany dom... \n", "1987 we will be swimming in ridicule french beach... \n", "1988 venezuela where anger over food shortages is ... \n", "\n", " Top24 \\\n", "0 b visitors suffering from mental illnesses ban... \n", "1 b russia is so much better at war \n", "2 b america the new mexico \n", "3 b georgian president says us military will ta... \n", "4 b darfur rebels accuse sudan of mounting major... \n", "... ... \n", "1984 new eu superstate plan by france germany cr... \n", "1985 suicide bombers strike lebanon \n", "1986 bbc news labour mps pass corbyn no confidence... \n", "1987 uefa says no minutes of silence for istanbul v... \n", "1988 a hindu temple worker has been killed by three... \n", "\n", " Top25 Unnamed: 28 \\\n", "0 b no help for mexico s kidnapping surge NaN \n", "1 b so this is what it s come to trading sex fo... NaN \n", "2 b bbc news asia pacific extinction by man... NaN \n", "3 b nobel laureate aleksander solzhenitsyn accu... NaN \n", "4 b philippines peace advocate say muslims nee... NaN \n", "... ... ... \n", "1984 pakistani clerics declare transgender marriage... NaN \n", "1985 mexico s security forces routinely use sexual... NaN \n", "1986 tiny new zealand town with too many jobs lau... NaN \n", "1987 law enforcement sources gun used in paris ter... NaN \n", "1988 ozone layer hole seems to be healing us amp... NaN \n", "\n", " Unnamed: 29 fluctuation \n", "0 NaN 0.055658 \n", "1 NaN -1.793642 \n", "2 NaN -0.757185 \n", "3 NaN 1.207153 \n", "4 NaN 1.795697 \n", "... ... ... \n", "1984 NaN NaN \n", "1985 NaN NaN \n", "1986 NaN NaN \n", "1987 NaN NaN \n", "1988 NaN NaN \n", "\n", "[1989 rows x 31 columns]" ] }, "metadata": {}, "execution_count": 47 } ], "metadata": {} }, { "cell_type": "code", "execution_count": 31, "source": [ "def preprocess(processdata):\n", " # 轉小寫\n", " headlines = []\n", " for i in range(1, 21):\n", " headlines.append('Top'+str(i))\n", " processdata[headlines] = processdata[headlines].astype(str)\n", " processdata[headlines] = processdata[headlines].applymap(str.lower)\n", " \n", " # 組成以天為單位的data\n", " processdata_headlines = []\n", " for row in range(0,len(processdata.index)):\n", " processdata_headlines.append(' '.join(str(x) for x in processdata.iloc[row,2:27]))\n", "\n", " # remove punctuation characters\n", " for line in range(len(processdata_headlines)):\n", " processdata_headlines[line] = re.sub(r'[^A-Za-z]',\" \", processdata_headlines[line])\n", "\n", " # 切字\n", " for sentence in range(len(processdata_headlines)):\n", " processdata_headlines[sentence] = word_tokenize(processdata_headlines[sentence]) \n", "\n", " # 去除停用詞\n", " alpha = []\n", " for abc in string.ascii_lowercase :\n", " alpha.append(abc) \n", " en_stops = stopwords.words('english')\n", " en_stops.extend(alpha)\n", " for sentence in range(len(processdata_headlines)):\n", " processdata_headlines[sentence] = [w for w in processdata_headlines[sentence] if w not in en_stops] \n", " \n", " # 單字變回原形\n", " for sentence in range(len(processdata_headlines)):\n", " processdata_headlines[sentence] = [WordNetLemmatizer().lemmatize(w) for w in processdata_headlines[sentence]]\n", " processdata_headlines[sentence] = [WordNetLemmatizer().lemmatize(w, pos='v') for w in processdata_headlines[sentence]] \n", "\n", " # 組回標題\n", " final_processdata_headlines = []\n", " for words in processdata_headlines :\n", " filter_words = \"\"\n", " for i in range(len(words)) :\n", " filter_words = filter_words + words[i] + \" \"\n", " final_processdata_headlines.append(filter_words) \n", "\n", " return final_processdata_headlines " ], "outputs": [], "metadata": {} }, { "cell_type": "code", "execution_count": 32, "source": [ "#將資料進行tfidf權重評分\n", "train = data[data['Date'] < '2015-01-01']\n", "test = data[data['Date'] > '2014-12-31']\n", "final_traindata = preprocess(train)\n", "final_testdata = preprocess(test)\n", "tfidf_vector = TfidfVectorizer(min_df=0.01, max_df=0.99, max_features=160, ngram_range=(2, 2))\n", "final_traindata_tfidf = tfidf_vector.fit_transform(final_traindata)\n", "final_testdata_tfidf = tfidf_vector.transform(final_testdata)" ], "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/Users/liuruizhe/miniforge3/lib/python3.9/site-packages/pandas/core/frame.py:3636: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " self[k1] = value[k2]\n" ] } ], "metadata": {} }, { "cell_type": "code", "execution_count": 33, "source": [ "#印出字詞及其tfidf\n", "terms = tfidf_vector.get_feature_names()\n", "sums = final_traindata_tfidf.sum(axis=0)\n", "data = []\n", "for col, term in enumerate(terms):\n", " data.append( (term, sums[0,col] ))\n", "\n", "ranking = pd.DataFrame(data, columns=['term','tfidf'])\n", "print(ranking)\n" ], "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ " term tfidf\n", "0 air force 15.454863\n", "1 air strike 22.767039\n", "2 al jazeera 38.015786\n", "3 al qaeda 40.412949\n", "4 al qaida 19.172821\n", ".. ... ...\n", "155 world war 28.663142\n", "156 year ago 32.269653\n", "157 year jail 16.555130\n", "158 year old 110.372398\n", "159 year prison 26.403931\n", "\n", "[160 rows x 2 columns]\n" ] } ], "metadata": {} }, { "cell_type": "code", "execution_count": 53, "source": [ "#列印出處理後資料集matrix\n", "dense = final_traindata_tfidf.todense()\n", "denselist = dense.tolist()\n", "df2 = pd.DataFrame(denselist, columns=terms)\n", "df2\n", "\n" ], "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
air forceair strikeal jazeeraal qaedaal qaidaamnesty internationalanti gayaround worldaustralian governmentbarack obama...world biggestworld cupworld firstworld largestworld newsworld waryear agoyear jailyear oldyear prison
00.00.000000.00.3617910.00.00.00.00.00.000000...0.00.0000000.00.00.0000000.4007270.00.000000.2282410.0
10.00.000000.00.0000000.00.00.00.00.00.000000...0.00.0000000.00.00.0000000.6713480.00.000000.0000000.0
20.00.000000.00.0000000.00.00.00.00.00.000000...0.00.0000000.00.00.0000000.0000000.00.000000.7902200.0
30.00.000000.00.0000000.00.00.00.00.00.000000...0.00.0000000.00.00.0000000.0000000.00.000000.3375380.0
40.00.000000.00.0000000.00.00.00.00.00.000000...0.00.0000000.00.00.3751070.0000000.00.000000.0000000.0
..................................................................
16060.00.000000.00.0000000.00.00.00.00.00.000000...0.00.7804270.00.00.0000000.0000000.00.000000.0000000.0
16070.00.000000.00.0000000.00.00.00.00.00.000000...0.00.0000000.00.00.0000000.0000000.00.000000.0000000.0
16080.00.000000.00.0000000.00.00.00.00.00.607745...0.00.0000000.00.00.0000000.0000000.00.000000.0000000.0
16090.00.000000.00.0000000.00.00.00.00.00.000000...0.00.0000000.00.00.4127630.0000000.00.416530.0000000.0
16100.00.380310.00.0000000.00.00.00.00.00.000000...0.00.0000000.00.00.0000000.0000000.00.000000.0000000.0
\n", "

1611 rows × 160 columns

\n", "
" ], "text/plain": [ " air force air strike al jazeera al qaeda al qaida \\\n", "0 0.0 0.00000 0.0 0.361791 0.0 \n", "1 0.0 0.00000 0.0 0.000000 0.0 \n", "2 0.0 0.00000 0.0 0.000000 0.0 \n", "3 0.0 0.00000 0.0 0.000000 0.0 \n", "4 0.0 0.00000 0.0 0.000000 0.0 \n", "... ... ... ... ... ... \n", "1606 0.0 0.00000 0.0 0.000000 0.0 \n", "1607 0.0 0.00000 0.0 0.000000 0.0 \n", "1608 0.0 0.00000 0.0 0.000000 0.0 \n", "1609 0.0 0.00000 0.0 0.000000 0.0 \n", "1610 0.0 0.38031 0.0 0.000000 0.0 \n", "\n", " amnesty international anti gay around world australian government \\\n", "0 0.0 0.0 0.0 0.0 \n", "1 0.0 0.0 0.0 0.0 \n", "2 0.0 0.0 0.0 0.0 \n", "3 0.0 0.0 0.0 0.0 \n", "4 0.0 0.0 0.0 0.0 \n", "... ... ... ... ... \n", "1606 0.0 0.0 0.0 0.0 \n", "1607 0.0 0.0 0.0 0.0 \n", "1608 0.0 0.0 0.0 0.0 \n", "1609 0.0 0.0 0.0 0.0 \n", "1610 0.0 0.0 0.0 0.0 \n", "\n", " barack obama ... world biggest world cup world first world largest \\\n", "0 0.000000 ... 0.0 0.000000 0.0 0.0 \n", "1 0.000000 ... 0.0 0.000000 0.0 0.0 \n", "2 0.000000 ... 0.0 0.000000 0.0 0.0 \n", "3 0.000000 ... 0.0 0.000000 0.0 0.0 \n", "4 0.000000 ... 0.0 0.000000 0.0 0.0 \n", "... ... ... ... ... ... ... \n", "1606 0.000000 ... 0.0 0.780427 0.0 0.0 \n", "1607 0.000000 ... 0.0 0.000000 0.0 0.0 \n", "1608 0.607745 ... 0.0 0.000000 0.0 0.0 \n", "1609 0.000000 ... 0.0 0.000000 0.0 0.0 \n", "1610 0.000000 ... 0.0 0.000000 0.0 0.0 \n", "\n", " world news world war year ago year jail year old year prison \n", "0 0.000000 0.400727 0.0 0.00000 0.228241 0.0 \n", "1 0.000000 0.671348 0.0 0.00000 0.000000 0.0 \n", "2 0.000000 0.000000 0.0 0.00000 0.790220 0.0 \n", "3 0.000000 0.000000 0.0 0.00000 0.337538 0.0 \n", "4 0.375107 0.000000 0.0 0.00000 0.000000 0.0 \n", "... ... ... ... ... ... ... \n", "1606 0.000000 0.000000 0.0 0.00000 0.000000 0.0 \n", "1607 0.000000 0.000000 0.0 0.00000 0.000000 0.0 \n", "1608 0.000000 0.000000 0.0 0.00000 0.000000 0.0 \n", "1609 0.412763 0.000000 0.0 0.41653 0.000000 0.0 \n", "1610 0.000000 0.000000 0.0 0.00000 0.000000 0.0 \n", "\n", "[1611 rows x 160 columns]" ] }, "metadata": {}, "execution_count": 53 } ], "metadata": {} } ], "metadata": { "orig_nbformat": 4, "language_info": { "name": "python", "version": "3.9.6", "mimetype": "text/x-python", "codemirror_mode": { "name": "ipython", "version": 3 }, "pygments_lexer": "ipython3", "nbconvert_exporter": "python", "file_extension": ".py" }, "kernelspec": { "name": "python3", "display_name": "Python 3.9.6 64-bit ('base': conda)" }, "interpreter": { "hash": "b4f9c837a52ebafc15ae09e90d6367eac1bb462061178a53d70c00ad6c75d32e" } }, "nbformat": 4, "nbformat_minor": 2 }