{ "cells": [ { "cell_type": "code", "execution_count": 27, "metadata": { "ExecuteTime": { "end_time": "2018-11-02T21:29:49.840309Z", "start_time": "2018-11-02T21:29:49.835307Z" } }, "outputs": [], "source": [ "import bs4 as bs, urllib, pandas as pd, numpy as np" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Parse past X years" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "ExecuteTime": { "end_time": "2018-11-02T21:29:52.964758Z", "start_time": "2018-11-02T21:29:52.960219Z" } }, "outputs": [], "source": [ "keyword='medve'\n", "baseurl=u'https://szekelyhon.ro/kereses?op=search&src_words='" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "ExecuteTime": { "end_time": "2018-11-02T21:30:03.654752Z", "start_time": "2018-11-02T21:30:03.635756Z" } }, "outputs": [], "source": [ "start='2002-12'\n", "#end='2018-11-01'\n", "#start='2018-10'\n", "end='2018-12'\n", "dates=[]\n", "datelist = pd.date_range(start=pd.to_datetime(start), end=pd.to_datetime(end), freq='M').tolist()\n", "for date in datelist:\n", " dates.append(str(date)[:10])" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "ExecuteTime": { "end_time": "2018-11-02T21:30:04.371597Z", "start_time": "2018-11-02T21:30:04.364599Z" } }, "outputs": [ { "data": { "text/plain": [ "['2002-12-31', '2003-01-31', '2003-02-28', '2003-03-31', '2003-04-30']" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dates[:5]" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "ExecuteTime": { "end_time": "2018-11-02T21:44:57.426093Z", "start_time": "2018-11-02T21:44:57.419089Z" } }, "outputs": [], "source": [ "def extractor(time1,time2):\n", " time1=dates[i]\n", " time2=dates[i+1]\n", " print('Parsing...',time1,'-',time2)\n", " url=baseurl+keyword+'&src_time1='+time1+'&src_time2='+time2\n", " html = urllib.request.urlopen(url).read()\n", " soup = bs.BeautifulSoup(html,'lxml')\n", " return soup.findAll(\"div\", {\"class\": \"cikkocka2c\"})" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "ExecuteTime": { "end_time": "2018-11-02T21:47:12.034704Z", "start_time": "2018-11-02T21:44:59.513282Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Parsing... 2002-12-31 - 2003-01-31\n", "Parsing... 2003-01-31 - 2003-02-28\n", "Parsing... 2003-02-28 - 2003-03-31\n", "Parsing... 2003-03-31 - 2003-04-30\n", "Parsing... 2003-04-30 - 2003-05-31\n", "Parsing... 2003-05-31 - 2003-06-30\n", "Parsing... 2003-06-30 - 2003-07-31\n", "Parsing... 2003-07-31 - 2003-08-31\n", "Parsing... 2003-08-31 - 2003-09-30\n", "Parsing... 2003-09-30 - 2003-10-31\n", "Parsing... 2003-10-31 - 2003-11-30\n", "Parsing... 2003-11-30 - 2003-12-31\n", "Parsing... 2003-12-31 - 2004-01-31\n", "Parsing... 2004-01-31 - 2004-02-29\n", "Parsing... 2004-02-29 - 2004-03-31\n", "Parsing... 2004-03-31 - 2004-04-30\n", "Parsing... 2004-04-30 - 2004-05-31\n", "Parsing... 2004-05-31 - 2004-06-30\n", "Parsing... 2004-06-30 - 2004-07-31\n", "Parsing... 2004-07-31 - 2004-08-31\n", "Parsing... 2004-08-31 - 2004-09-30\n", "Parsing... 2004-09-30 - 2004-10-31\n", "Parsing... 2004-10-31 - 2004-11-30\n", "Parsing... 2004-11-30 - 2004-12-31\n", "Parsing... 2004-12-31 - 2005-01-31\n", "Parsing... 2005-01-31 - 2005-02-28\n", "Parsing... 2005-02-28 - 2005-03-31\n", "Parsing... 2005-03-31 - 2005-04-30\n", "Parsing... 2005-04-30 - 2005-05-31\n", "Parsing... 2005-05-31 - 2005-06-30\n", "Parsing... 2005-06-30 - 2005-07-31\n", "Parsing... 2005-07-31 - 2005-08-31\n", "Parsing... 2005-08-31 - 2005-09-30\n", "Parsing... 2005-09-30 - 2005-10-31\n", "Parsing... 2005-10-31 - 2005-11-30\n", "Parsing... 2005-11-30 - 2005-12-31\n", "Parsing... 2005-12-31 - 2006-01-31\n", "Parsing... 2006-01-31 - 2006-02-28\n", "Parsing... 2006-02-28 - 2006-03-31\n", "Parsing... 2006-03-31 - 2006-04-30\n", "Parsing... 2006-04-30 - 2006-05-31\n", "Parsing... 2006-05-31 - 2006-06-30\n", "Parsing... 2006-06-30 - 2006-07-31\n", "Parsing... 2006-07-31 - 2006-08-31\n", "Parsing... 2006-08-31 - 2006-09-30\n", "Parsing... 2006-09-30 - 2006-10-31\n", "Parsing... 2006-10-31 - 2006-11-30\n", "Parsing... 2006-11-30 - 2006-12-31\n", "Parsing... 2006-12-31 - 2007-01-31\n", "Parsing... 2007-01-31 - 2007-02-28\n", "Parsing... 2007-02-28 - 2007-03-31\n", "Parsing... 2007-03-31 - 2007-04-30\n", "Parsing... 2007-04-30 - 2007-05-31\n", "Parsing... 2007-05-31 - 2007-06-30\n", "Parsing... 2007-06-30 - 2007-07-31\n", "Parsing... 2007-07-31 - 2007-08-31\n", "Parsing... 2007-08-31 - 2007-09-30\n", "Parsing... 2007-09-30 - 2007-10-31\n", "Parsing... 2007-10-31 - 2007-11-30\n", "Parsing... 2007-11-30 - 2007-12-31\n", "Parsing... 2007-12-31 - 2008-01-31\n", "Parsing... 2008-01-31 - 2008-02-29\n", "Parsing... 2008-02-29 - 2008-03-31\n", "Parsing... 2008-03-31 - 2008-04-30\n", "Parsing... 2008-04-30 - 2008-05-31\n", "Parsing... 2008-05-31 - 2008-06-30\n", "Parsing... 2008-06-30 - 2008-07-31\n", "Parsing... 2008-07-31 - 2008-08-31\n", "Parsing... 2008-08-31 - 2008-09-30\n", "Parsing... 2008-09-30 - 2008-10-31\n", "Parsing... 2008-10-31 - 2008-11-30\n", "Parsing... 2008-11-30 - 2008-12-31\n", "Parsing... 2008-12-31 - 2009-01-31\n", "Parsing... 2009-01-31 - 2009-02-28\n", "Parsing... 2009-02-28 - 2009-03-31\n", "Parsing... 2009-03-31 - 2009-04-30\n", "Parsing... 2009-04-30 - 2009-05-31\n", "Parsing... 2009-05-31 - 2009-06-30\n", "Parsing... 2009-06-30 - 2009-07-31\n", "Parsing... 2009-07-31 - 2009-08-31\n", "Parsing... 2009-08-31 - 2009-09-30\n", "Parsing... 2009-09-30 - 2009-10-31\n", "Parsing... 2009-10-31 - 2009-11-30\n", "Parsing... 2009-11-30 - 2009-12-31\n", "Parsing... 2009-12-31 - 2010-01-31\n", "Parsing... 2010-01-31 - 2010-02-28\n", "Parsing... 2010-02-28 - 2010-03-31\n", "Parsing... 2010-03-31 - 2010-04-30\n", "Parsing... 2010-04-30 - 2010-05-31\n", "Parsing... 2010-05-31 - 2010-06-30\n", "Parsing... 2010-06-30 - 2010-07-31\n", "Parsing... 2010-07-31 - 2010-08-31\n", "Parsing... 2010-08-31 - 2010-09-30\n", "Parsing... 2010-09-30 - 2010-10-31\n", "Parsing... 2010-10-31 - 2010-11-30\n", "Parsing... 2010-11-30 - 2010-12-31\n", "Parsing... 2010-12-31 - 2011-01-31\n", "Parsing... 2011-01-31 - 2011-02-28\n", "Parsing... 2011-02-28 - 2011-03-31\n", "Parsing... 2011-03-31 - 2011-04-30\n", "Parsing... 2011-04-30 - 2011-05-31\n", "Parsing... 2011-05-31 - 2011-06-30\n", "Parsing... 2011-06-30 - 2011-07-31\n", "Parsing... 2011-07-31 - 2011-08-31\n", "Parsing... 2011-08-31 - 2011-09-30\n", "Parsing... 2011-09-30 - 2011-10-31\n", "Parsing... 2011-10-31 - 2011-11-30\n", "Parsing... 2011-11-30 - 2011-12-31\n", "Parsing... 2011-12-31 - 2012-01-31\n", "Parsing... 2012-01-31 - 2012-02-29\n", "Parsing... 2012-02-29 - 2012-03-31\n", "Parsing... 2012-03-31 - 2012-04-30\n", "Parsing... 2012-04-30 - 2012-05-31\n", "Parsing... 2012-05-31 - 2012-06-30\n", "Parsing... 2012-06-30 - 2012-07-31\n", "Parsing... 2012-07-31 - 2012-08-31\n", "Parsing... 2012-08-31 - 2012-09-30\n", "Parsing... 2012-09-30 - 2012-10-31\n", "Parsing... 2012-10-31 - 2012-11-30\n", "Parsing... 2012-11-30 - 2012-12-31\n", "Parsing... 2012-12-31 - 2013-01-31\n", "Parsing... 2013-01-31 - 2013-02-28\n", "Parsing... 2013-02-28 - 2013-03-31\n", "Parsing... 2013-03-31 - 2013-04-30\n", "Parsing... 2013-04-30 - 2013-05-31\n", "Parsing... 2013-05-31 - 2013-06-30\n", "Parsing... 2013-06-30 - 2013-07-31\n", "Parsing... 2013-07-31 - 2013-08-31\n", "Parsing... 2013-08-31 - 2013-09-30\n", "Parsing... 2013-09-30 - 2013-10-31\n", "Parsing... 2013-10-31 - 2013-11-30\n", "Parsing... 2013-11-30 - 2013-12-31\n", "Parsing... 2013-12-31 - 2014-01-31\n", "Parsing... 2014-01-31 - 2014-02-28\n", "Parsing... 2014-02-28 - 2014-03-31\n", "Parsing... 2014-03-31 - 2014-04-30\n", "Parsing... 2014-04-30 - 2014-05-31\n", "Parsing... 2014-05-31 - 2014-06-30\n", "Parsing... 2014-06-30 - 2014-07-31\n", "Parsing... 2014-07-31 - 2014-08-31\n", "Parsing... 2014-08-31 - 2014-09-30\n", "Parsing... 2014-09-30 - 2014-10-31\n", "Parsing... 2014-10-31 - 2014-11-30\n", "Parsing... 2014-11-30 - 2014-12-31\n", "Parsing... 2014-12-31 - 2015-01-31\n", "Parsing... 2015-01-31 - 2015-02-28\n", "Parsing... 2015-02-28 - 2015-03-31\n", "Parsing... 2015-03-31 - 2015-04-30\n", "Parsing... 2015-04-30 - 2015-05-31\n", "Parsing... 2015-05-31 - 2015-06-30\n", "Parsing... 2015-06-30 - 2015-07-31\n", "Parsing... 2015-07-31 - 2015-08-31\n", "Parsing... 2015-08-31 - 2015-09-30\n", "Parsing... 2015-09-30 - 2015-10-31\n", "Parsing... 2015-10-31 - 2015-11-30\n", "Parsing... 2015-11-30 - 2015-12-31\n", "Parsing... 2015-12-31 - 2016-01-31\n", "Parsing... 2016-01-31 - 2016-02-29\n", "Parsing... 2016-02-29 - 2016-03-31\n", "Parsing... 2016-03-31 - 2016-04-30\n", "Parsing... 2016-04-30 - 2016-05-31\n", "Parsing... 2016-05-31 - 2016-06-30\n", "Parsing... 2016-06-30 - 2016-07-31\n", "Parsing... 2016-07-31 - 2016-08-31\n", "Parsing... 2016-08-31 - 2016-09-30\n", "Parsing... 2016-09-30 - 2016-10-31\n", "Parsing... 2016-10-31 - 2016-11-30\n", "Parsing... 2016-11-30 - 2016-12-31\n", "Parsing... 2016-12-31 - 2017-01-31\n", "Parsing... 2017-01-31 - 2017-02-28\n", "Parsing... 2017-02-28 - 2017-03-31\n", "Parsing... 2017-03-31 - 2017-04-30\n", "Parsing... 2017-04-30 - 2017-05-31\n", "Parsing... 2017-05-31 - 2017-06-30\n", "Parsing... 2017-06-30 - 2017-07-31\n", "Parsing... 2017-07-31 - 2017-08-31\n", "Parsing... 2017-08-31 - 2017-09-30\n", "Parsing... 2017-09-30 - 2017-10-31\n", "Parsing... 2017-10-31 - 2017-11-30\n", "Parsing... 2017-11-30 - 2017-12-31\n", "Parsing... 2017-12-31 - 2018-01-31\n", "Parsing... 2018-01-31 - 2018-02-28\n", "Parsing... 2018-02-28 - 2018-03-31\n", "Parsing... 2018-03-31 - 2018-04-30\n", "Parsing... 2018-04-30 - 2018-05-31\n", "Parsing... 2018-05-31 - 2018-06-30\n", "Parsing... 2018-06-30 - 2018-07-31\n", "Parsing... 2018-07-31 - 2018-08-31\n", "Parsing... 2018-08-31 - 2018-09-30\n", "Parsing... 2018-09-30 - 2018-10-31\n", "Parsing... 2018-10-31 - 2018-11-30\n" ] } ], "source": [ "divs=[]\n", "for i in range(len(dates)-1):\n", " time1=dates[i]\n", " time2=dates[i+1]\n", " divs.append(extractor(time1,time2))" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "ExecuteTime": { "end_time": "2018-11-02T21:50:32.474662Z", "start_time": "2018-11-02T21:50:32.468672Z" } }, "outputs": [], "source": [ "def date_hu_en(i):\n", " date=i[6:-4]\n", " if date=='augusztus': m='08'\n", " elif date=='december': m='12'\n", " elif date=='február': m='02'\n", " elif date=='január': m='01'\n", " elif date=='július': m='07'\n", " elif date=='június': m='06'\n", " elif date=='május': m='05'\n", " elif date=='március': m='03'\n", " elif date=='november': m='11'\n", " elif date==u'október': m='10'\n", " elif date==u'szeptember': m='09'\n", " elif date==u'április': m='04'\n", " else: return date\n", " return i[:4]+'-'+m+'-'+i[-3:-1]" ] }, { "cell_type": "code", "execution_count": 189, "metadata": { "ExecuteTime": { "end_time": "2018-11-02T22:26:20.044517Z", "start_time": "2018-11-02T22:26:20.039521Z" } }, "outputs": [], "source": [ "def find_all(s, ch):\n", " return [i for i, letter in enumerate(s) if letter == ch]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Relevant = Medves cikk-e vagy sem: 1-igen, 0-nem biztos, -1:biztosan nem \n", "Deaths = Halalok szama (ha ismert) \n", "Severity = Sulyossag: 0-mas jellegu hir, 1-latas, 2-allat-tamadas, 3-ember-tamadas \n", "Duplicate = 0: Eredeti cikk, 1: Masolat, 2: Osszegzes" ] }, { "cell_type": "code", "execution_count": 335, "metadata": { "ExecuteTime": { "end_time": "2018-11-02T23:33:34.743906Z", "start_time": "2018-11-02T23:33:34.732903Z" } }, "outputs": [], "source": [ "def text_processor(title,content):\n", " relevant=0\n", " severity=0\n", " deaths=0\n", " tamadas=[u'támad',u'sebes']\n", " for i in tamadas:\n", " if i in title+content:\n", " relevant=1\n", " severity=2\n", " tamadas=[u'halál',u'áldozat',u'ölt ',u'pusztít']\n", " for i in tamadas:\n", " if i in title+content:\n", " relevant=1\n", " severity=3\n", " tamadas=[u'medve',u'medvé']\n", " for i in tamadas:\n", " if i in title.replace(',',' ').replace('.',' ').lower():\n", " relevant=1\n", " tamadas=[u'medvegyev',u'jegesmedvék',u'medvehagyma',u'aranymedve',u'szibéria',u' kupa',\n", " u'jégkorong',u'kosárlabda',u'labdarúgás',u'labdarúgó',u'steaua',\n", " u'c osztály',u'berlin',u'állatkert',u'medve-tó',u'oroszorsz',u' orosz ']\n", " for i in tamadas:\n", " if i in (title+content).replace(',',' ').replace('.',' ').lower():\n", " relevant=-1\n", " return relevant,severity,deaths" ] }, { "cell_type": "code", "execution_count": 336, "metadata": { "ExecuteTime": { "end_time": "2018-11-02T23:33:37.572899Z", "start_time": "2018-11-02T23:33:35.465906Z" } }, "outputs": [], "source": [ "hirek=[]\n", "tagset=set()\n", "for i in range(len(dates)-1):\n", " time2=dates[i+1]\n", " divgroup=divs[i]\n", " for div in divgroup:\n", " icat=''\n", " img=div.find('img')\n", " if img !=None: \n", " img=img['src']\n", " #infer image category from image link\n", " icats=find_all(img,'/')\n", " if len(icats)>4:\n", " icat=img[icats[3]+1:icats[4]]\n", " tags=div.find(\"div\", {\"class\": \"tags_con1\"})\n", " if tags!=None: \n", " tags=[j.text.strip() for j in tags.findAll('div')]\n", " idiv=div.find(\"div\", {\"class\": \"catinner\"})\n", " if idiv!=None:\n", " idiv=idiv.find('div')\n", " content=div.find('p')\n", " date=idiv.text[idiv.text.find('20'):idiv.text.find(',')]\n", " title=div.find('h2').text\n", " if content==None:\n", " sdiv=str(div)[::-1]\n", " content=sdiv[:sdiv.find('>a/<')].replace('\\r','').replace('\\t','').replace('\\n','')[::-1][:-6]\n", " else: content=content.text\n", " content=content.replace('
','')\n", " link=div.findAll('a')[-1]['href']\n", " #infer category from link\n", " cats=find_all(link,'/')\n", " if len(cats)>3:\n", " cat=link[cats[2]+1:cats[3]]\n", " else: cat=''\n", " #infer attack from plain text\n", " relevant,severity,deaths=text_processor(title,content)\n", " if tags!=None:\n", " notags=[u'Húsvét',u'Film',u'Egészségügy',u'Külföld',u'Színház',u'Ünnep']\n", " for notag in notags:\n", " if notag in tags:\n", " relevant=-1\n", " break\n", " if ((relevant>-1)&\\\n", " (cat not in ['sport','muvelodes','sms-e-mail-velemeny','tusvanyos'])&\\\n", " (title not in [u'Röviden'])):\n", " if tags!=None: \n", " tagset=tagset.union(set(tags))\n", " if 'medve' in tags:\n", " relevant=1\n", " hirek.append({'date':date_hu_en(date),\n", " 'hudate':date,\n", " 'title':title,\n", " 'image':img,\n", " 'tags':repr(tags),\n", " 'content':content,\n", " 'link':link,\n", " 'category':cat,\n", " 'icategory':icat,\n", " 'relevant':relevant,\n", " 'severity':severity,\n", " 'deaths':deaths,\n", " 'duplicate':0\n", " })" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Összes medvés hír" ] }, { "cell_type": "code", "execution_count": 337, "metadata": { "ExecuteTime": { "end_time": "2018-11-02T23:33:37.640906Z", "start_time": "2018-11-02T23:33:37.577901Z" } }, "outputs": [], "source": [ "df=pd.DataFrame().from_dict(hirek)\n", "df['date']=pd.to_datetime(df['date'])\n", "df=df.sort_values('date').drop_duplicates().reset_index(drop=True)" ] }, { "cell_type": "code", "execution_count": 338, "metadata": { "ExecuteTime": { "end_time": "2018-11-02T23:33:37.653905Z", "start_time": "2018-11-02T23:33:37.645906Z" } }, "outputs": [ { "data": { "text/plain": [ "1168" ] }, "execution_count": 338, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(hirek)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Save to medve Excel. Manual curation" ] }, { "cell_type": "code", "execution_count": 339, "metadata": { "ExecuteTime": { "end_time": "2018-11-02T23:33:41.259906Z", "start_time": "2018-11-02T23:33:41.251906Z" } }, "outputs": [ { "data": { "text/plain": [ "Index(['category', 'content', 'date', 'deaths', 'duplicate', 'hudate',\n", " 'icategory', 'image', 'link', 'relevant', 'severity', 'tags', 'title'],\n", " dtype='object')" ] }, "execution_count": 339, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.columns" ] }, { "cell_type": "code", "execution_count": 345, "metadata": { "ExecuteTime": { "end_time": "2018-11-02T23:35:20.799334Z", "start_time": "2018-11-02T23:35:20.792335Z" }, "code_folding": [] }, "outputs": [], "source": [ "dm=df[[ 'date', 'hudate', 'link','image', 'category','icategory','tags','title',\n", " 'content']]\n", "dc=df[['title','content','relevant', 'severity','deaths','duplicate']]" ] }, { "cell_type": "code", "execution_count": 348, "metadata": { "ExecuteTime": { "end_time": "2018-11-02T23:35:52.590692Z", "start_time": "2018-11-02T23:35:51.537703Z" } }, "outputs": [], "source": [ "#save parsed data\n", "dm.to_excel('szekelyhon_medve.xlsx')" ] }, { "cell_type": "code", "execution_count": 361, "metadata": { "ExecuteTime": { "end_time": "2018-11-02T23:39:45.311852Z", "start_time": "2018-11-02T23:39:44.899837Z" } }, "outputs": [], "source": [ "#save data for curation\n", "#1 if you dont have savedata yet\n", "existing_savedata=False\n", "if not existing_savedata:\n", " dc.to_excel('szekelyhon_medve_curated.xlsx')\n", "#2 if you already have savedata\n", "else:\n", " dc2=pd.read_excel('szekelyhon_medve_curated.xlsx')\n", " dc2.combine_first(dc).to_excel('szekelyhon_medve_curated.xlsx')" ] }, { "cell_type": "code", "execution_count": 379, "metadata": { "ExecuteTime": { "end_time": "2018-11-02T23:45:43.993762Z", "start_time": "2018-11-02T23:45:43.624182Z" } }, "outputs": [], "source": [ "dr=pd.read_excel('data/szekelyhon_medve_curated_manual.xlsx')" ] }, { "cell_type": "code", "execution_count": 380, "metadata": { "ExecuteTime": { "end_time": "2018-11-02T23:45:44.043745Z", "start_time": "2018-11-02T23:45:44.035740Z" } }, "outputs": [], "source": [ "dr=dr[['content','relevant','deaths','severity','duplicate']].set_index('content')" ] }, { "cell_type": "code", "execution_count": 465, "metadata": { "ExecuteTime": { "end_time": "2018-11-03T00:06:55.264347Z", "start_time": "2018-11-03T00:06:52.954352Z" } }, "outputs": [], "source": [ "relevant=[]\n", "deaths=[]\n", "severity=[]\n", "duplicate=[]\n", "for i in range(len(dc.index)):\n", " if dc.loc[i]['content']!='':\n", " if dc.loc[i]['content'] in dr.index:\n", " dk=dr.loc[dc.loc[i]['content']]\n", " else:\n", " dk=dc.loc[i]\n", " else:\n", " dk=dc.loc[i]\n", " relevant.append(np.array(dk['relevant']).flatten()[0])\n", " deaths.append(np.array(dk['deaths']).flatten()[0])\n", " severity.append(np.array(dk['severity']).flatten()[0])\n", " duplicate.append(np.array(dk['duplicate']).flatten()[0])" ] }, { "cell_type": "code", "execution_count": 467, "metadata": { "ExecuteTime": { "end_time": "2018-11-03T00:07:37.674324Z", "start_time": "2018-11-03T00:07:36.072326Z" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", " \"\"\"Entry point for launching an IPython kernel.\n", "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:2: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", " \n", "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:3: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", " This is separate from the ipykernel package so we can avoid doing imports until\n", "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:4: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", " after removing the cwd from sys.path.\n" ] } ], "source": [ "dc['relevant']=relevant\n", "dc['deaths']=deaths\n", "dc['severity']=severity\n", "dc['duplicate']=duplicate" ] }, { "cell_type": "code", "execution_count": 468, "metadata": { "ExecuteTime": { "end_time": "2018-11-03T00:07:56.647215Z", "start_time": "2018-11-03T00:07:56.053223Z" } }, "outputs": [], "source": [ "dc.to_excel('szekelyhon_medve_curated.xlsx')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.0" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 2 }