{"cells":[{"cell_type":"markdown","source":["#### Welcome to the example notebook for FIFEforSpark! \n\nYou may recognize the following example from FIFE's notebook found [here](https://github.com/IDA-HumanCapital/fife/blob/master/examples/country_leadership.ipynb). Like that example notebook, we use the July 2020 edition of the Rulers, Elections, and Irregular Governance (REIGN) dataset, a monthly panel of national leaders and political conditions since January 1950. We load the REIGN data directly from its online archive.\n\nFirst, we import the necessary packages. In this case, we import SparkFiles, which is required to read the data in from the URL, in addition to several fifeforspark modules."],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"98c3970d-e72b-4104-8949-2137052a49f1"}}},{"cell_type":"code","source":["import pyspark\nfrom pyspark import SparkFiles\nimport fifeforspark\nfrom fifeforspark.utils import create_example_data2\nfrom fifeforspark.processors import PanelDataProcessor\nfrom fifeforspark.lgb_modelers import LGBSurvivalModeler"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"c8acc466-0ebc-4ade-85b4-94eeb5e79801"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":0},{"cell_type":"markdown","source":["Now that we have the necessary packages loaded, we read in the data from a url:"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"861e7c8a-5725-4d4f-9985-f5915bfab3ec"}}},{"cell_type":"code","source":["url = \"https://www.dl.dropboxusercontent.com/s/3tdswu2jfgwp4xw/REIGN_2020_7.csv?dl=0\"\nspark.sparkContext.addFile(url)\n\ndf = spark.read.csv(\"file://\"+SparkFiles.get(\"REIGN_2020_7.csv\"), header=True, inferSchema= True)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"ca5a1581-14e5-4a90-90ea-47435195b765"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":0},{"cell_type":"markdown","source":["The data is stored in a Spark DataFrame, which is different from what you may expect if you are more familiar with FIFE. Let's examine our data a bit more."],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"405ec94a-c629-455c-a96a-3d51cfca7b2c"}}},{"cell_type":"code","source":["df.show(10)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"21a7d52a-9e3e-4c41-a9e8-f44d7f72b0a7"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"
+-----+-------+------+------+-----+-------+----+----+--------------+-------------+--------------------+------------+-------+-------+--------+--------------+------------+---------------+----------+-----------+-----------+----------+-------------+---------------+--------------+-------------+-------------+---------------+-------+------------+---------+---------+-------------+------+----------+-------------------+--------+-----------+\n|ccode|country|leader| year|month|elected| age|male|militarycareer|tenure_months| government|anticipation|ref_ant|leg_ant|exec_ant|irreg_lead_ant|election_now|election_recent|leg_recent|exec_recent|lead_recent|ref_recent|direct_recent|indirect_recent|victory_recent|defeat_recent|change_recent|nochange_recent|delayed|lastelection| loss|irregular|prev_conflict|pt_suc|pt_attempt| precip|couprisk|pctile_risk|\n+-----+-------+------+------+-----+-------+----+----+--------------+-------------+--------------------+------------+-------+-------+--------+--------------+------------+---------------+----------+-----------+-----------+----------+-------------+---------------+--------------+-------------+-------------+---------------+-------+------------+---------+---------+-------------+------+----------+-------------------+--------+-----------+\n| 2.0| USA|Truman|1950.0| 1.0| 1.0|66.0| 1| 0.0| 58.0|Presidential Demo...| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 2.6390574| 5.327876| 7.565793| 0.0| 0.0| 0.0|-0.0690575837052633| null| null|\n| 2.0| USA|Truman|1950.0| 2.0| 1.0|66.0| 1| 0.0| 59.0|Presidential Demo...| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 2.7080503| 5.332719|7.5663114| 0.0| 0.0| 0.0| -0.11372068300939| null| null|\n| 2.0| USA|Truman|1950.0| 3.0| 1.0|66.0| 1| 0.0| 60.0|Presidential Demo...| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 2.7725887|5.3375382|7.5668287| 0.0| 0.0| 0.0| 
-0.108042069627093| null| null|\n| 2.0| USA|Truman|1950.0| 4.0| 1.0|66.0| 1| 0.0| 61.0|Presidential Demo...| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 2.8332133|5.3423343|7.5673456| 0.0| 0.0| 0.0|-0.0416001452793281| null| null|\n| 2.0| USA|Truman|1950.0| 5.0| 1.0|66.0| 1| 0.0| 62.0|Presidential Demo...| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 2.8903718|5.3471074|7.5678625| 0.0| 0.0| 0.0| -0.129702783937251| null| null|\n| 2.0| USA|Truman|1950.0| 6.0| 1.0|66.0| 1| 0.0| 63.0|Presidential Demo...| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 2.944439| 5.351858|7.5683794| 0.0| 0.0| 0.0| -0.178496151195764| null| null|\n| 2.0| USA|Truman|1950.0| 7.0| 1.0|66.0| 1| 0.0| 64.0|Presidential Demo...| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 2.9957323|5.3565865| 7.568896| 0.0| 0.0| 0.0| -0.042660054596682| null| null|\n| 2.0| USA|Truman|1950.0| 8.0| 1.0|66.0| 1| 0.0| 65.0|Presidential Demo...| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 3.0445225|5.3612924|7.5694118| 0.0| 0.0| 0.0| -0.070590356102934| null| null|\n| 2.0| USA|Truman|1950.0| 9.0| 1.0|66.0| 1| 0.0| 66.0|Presidential Demo...| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 3.0910425| 5.365976|7.5699277| 0.0| 0.0| 0.0| 0.0355567077070064| null| null|\n| 2.0| USA|Truman|1950.0| 10.0| 1.0|66.0| 1| 0.0| 67.0|Presidential Demo...| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 3.1354942| 5.370638| 7.570443| 0.0| 0.0| 0.0| -0.138817795625302| null| 
null|\n+-----+-------+------+------+-----+-------+----+----+--------------+-------------+--------------------+------------+-------+-------+--------+--------------+------------+---------------+----------+-----------+-----------+----------+-------------+---------------+--------------+-------------+-------------+---------------+-------+------------+---------+---------+-------------+------+----------+-------------------+--------+-----------+\nonly showing top 10 rows\n\n
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n
+-----+-------+------+------+-----+-------+----+----+--------------+-------------+--------------------+------------+-------+-------+--------+--------------+------------+---------------+----------+-----------+-----------+----------+-------------+---------------+--------------+-------------+-------------+---------------+-------+------------+---------+---------+-------------+------+----------+-------------------+--------+-----------+\nccode|country|leader| year|month|elected| age|male|militarycareer|tenure_months| government|anticipation|ref_ant|leg_ant|exec_ant|irreg_lead_ant|election_now|election_recent|leg_recent|exec_recent|lead_recent|ref_recent|direct_recent|indirect_recent|victory_recent|defeat_recent|change_recent|nochange_recent|delayed|lastelection| loss|irregular|prev_conflict|pt_suc|pt_attempt| precip|couprisk|pctile_risk|\n+-----+-------+------+------+-----+-------+----+----+--------------+-------------+--------------------+------------+-------+-------+--------+--------------+------------+---------------+----------+-----------+-----------+----------+-------------+---------------+--------------+-------------+-------------+---------------+-------+------------+---------+---------+-------------+------+----------+-------------------+--------+-----------+\n 2.0| USA|Truman|1950.0| 1.0| 1.0|66.0| 1| 0.0| 58.0|Presidential Demo...| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 2.6390574| 5.327876| 7.565793| 0.0| 0.0| 0.0|-0.0690575837052633| null| null|\n 2.0| USA|Truman|1950.0| 2.0| 1.0|66.0| 1| 0.0| 59.0|Presidential Demo...| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 2.7080503| 5.332719|7.5663114| 0.0| 0.0| 0.0| -0.11372068300939| null| null|\n 2.0| USA|Truman|1950.0| 3.0| 1.0|66.0| 1| 0.0| 60.0|Presidential Demo...| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 2.7725887|5.3375382|7.5668287| 0.0| 0.0| 0.0| -0.108042069627093| 
null| null|\n 2.0| USA|Truman|1950.0| 4.0| 1.0|66.0| 1| 0.0| 61.0|Presidential Demo...| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 2.8332133|5.3423343|7.5673456| 0.0| 0.0| 0.0|-0.0416001452793281| null| null|\n 2.0| USA|Truman|1950.0| 5.0| 1.0|66.0| 1| 0.0| 62.0|Presidential Demo...| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 2.8903718|5.3471074|7.5678625| 0.0| 0.0| 0.0| -0.129702783937251| null| null|\n 2.0| USA|Truman|1950.0| 6.0| 1.0|66.0| 1| 0.0| 63.0|Presidential Demo...| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 2.944439| 5.351858|7.5683794| 0.0| 0.0| 0.0| -0.178496151195764| null| null|\n 2.0| USA|Truman|1950.0| 7.0| 1.0|66.0| 1| 0.0| 64.0|Presidential Demo...| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 2.9957323|5.3565865| 7.568896| 0.0| 0.0| 0.0| -0.042660054596682| null| null|\n 2.0| USA|Truman|1950.0| 8.0| 1.0|66.0| 1| 0.0| 65.0|Presidential Demo...| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 3.0445225|5.3612924|7.5694118| 0.0| 0.0| 0.0| -0.070590356102934| null| null|\n 2.0| USA|Truman|1950.0| 9.0| 1.0|66.0| 1| 0.0| 66.0|Presidential Demo...| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 3.0910425| 5.365976|7.5699277| 0.0| 0.0| 0.0| 0.0355567077070064| null| null|\n 2.0| USA|Truman|1950.0| 10.0| 1.0|66.0| 1| 0.0| 67.0|Presidential Demo...| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 3.1354942| 5.370638| 7.570443| 0.0| 0.0| 0.0| -0.138817795625302| null| 
null|\n+-----+-------+------+------+-----+-------+----+----+--------------+-------------+--------------------+------------+-------+-------+--------+--------------+------------+---------------+----------+-----------+-----------+----------+-------------+---------------+--------------+-------------+-------------+---------------+-------+------------+---------+---------+-------------+------+----------+-------------------+--------+-----------+\nonly showing top 10 rows\n\n
"]}}],"execution_count":0},{"cell_type":"markdown","source":["This isn't very pleasant to look at; however, the advantage of using a Spark DataFrame here (even though this could fit on one node) is that it's distributed. Fortunately, we can use the display() function to output a cleaner dataframe."],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"92bbaa43-9536-43f5-962f-1f68ba2e25ec"}}},{"cell_type":"code","source":["display(df.limit(7))"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"1b8b3cdf-421a-447a-9d73-f908d6495e92"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"overflow":false,"datasetInfos":[],"data":[[2.0,"USA","Truman",1950.0,1.0,1.0,66.0,1,0.0,58.0,"Presidential Democracy",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.6390574,5.327876,7.565793,0.0,0.0,0.0,-0.0690575837052633,null,null],[2.0,"USA","Truman",1950.0,2.0,1.0,66.0,1,0.0,59.0,"Presidential Democracy",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.7080503,5.332719,7.5663114,0.0,0.0,0.0,-0.11372068300939,null,null],[2.0,"USA","Truman",1950.0,3.0,1.0,66.0,1,0.0,60.0,"Presidential Democracy",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.7725887,5.3375382,7.5668287,0.0,0.0,0.0,-0.108042069627093,null,null],[2.0,"USA","Truman",1950.0,4.0,1.0,66.0,1,0.0,61.0,"Presidential Democracy",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.8332133,5.3423343,7.5673456,0.0,0.0,0.0,-0.0416001452793281,null,null],[2.0,"USA","Truman",1950.0,5.0,1.0,66.0,1,0.0,62.0,"Presidential Democracy",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.8903718,5.3471074,7.5678625,0.0,0.0,0.0,-0.129702783937251,null,null],[2.0,"USA","Truman",1950.0,6.0,1.0,66.0,1,0.0,63.0,"Presidential 
Democracy",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.944439,5.351858,7.5683794,0.0,0.0,0.0,-0.178496151195764,null,null],[2.0,"USA","Truman",1950.0,7.0,1.0,66.0,1,0.0,64.0,"Presidential Democracy",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.9957323,5.3565865,7.568896,0.0,0.0,0.0,-0.042660054596682,null,null]],"plotOptions":{"displayType":"table","customPlotOptions":{},"pivotColumns":null,"pivotAggregation":null,"xColumns":null,"yColumns":null},"columnCustomDisplayInfos":{},"aggType":"","isJsonSchema":true,"removedWidgets":[],"aggSchema":[],"schema":[{"name":"ccode","type":"\"double\"","metadata":"{}"},{"name":"country","type":"\"string\"","metadata":"{}"},{"name":"leader","type":"\"string\"","metadata":"{}"},{"name":"year","type":"\"double\"","metadata":"{}"},{"name":"month","type":"\"double\"","metadata":"{}"},{"name":"elected","type":"\"double\"","metadata":"{}"},{"name":"age","type":"\"double\"","metadata":"{}"},{"name":"male","type":"\"integer\"","metadata":"{}"},{"name":"militarycareer","type":"\"double\"","metadata":"{}"},{"name":"tenure_months","type":"\"double\"","metadata":"{}"},{"name":"government","type":"\"string\"","metadata":"{}"},{"name":"anticipation","type":"\"double\"","metadata":"{}"},{"name":"ref_ant","type":"\"double\"","metadata":"{}"},{"name":"leg_ant","type":"\"double\"","metadata":"{}"},{"name":"exec_ant","type":"\"double\"","metadata":"{}"},{"name":"irreg_lead_ant","type":"\"double\"","metadata":"{}"},{"name":"election_now","type":"\"double\"","metadata":"{}"},{"name":"election_recent","type":"\"double\"","metadata":"{}"},{"name":"leg_recent","type":"\"double\"","metadata":"{}"},{"name":"exec_recent","type":"\"double\"","metadata":"{}"},{"name":"lead_recent","type":"\"double\"","metadata":"{}"},{"name":"ref_recent","type":"\"double\"","metadata":"{}"},{"name":"direct_recent","type":"\"double\"","metadata":"{}"},{"name":"indirect_recent","type":"\"double\"","metadata":"{}"},{"name
":"victory_recent","type":"\"double\"","metadata":"{}"},{"name":"defeat_recent","type":"\"double\"","metadata":"{}"},{"name":"change_recent","type":"\"double\"","metadata":"{}"},{"name":"nochange_recent","type":"\"double\"","metadata":"{}"},{"name":"delayed","type":"\"double\"","metadata":"{}"},{"name":"lastelection","type":"\"double\"","metadata":"{}"},{"name":"loss","type":"\"double\"","metadata":"{}"},{"name":"irregular","type":"\"double\"","metadata":"{}"},{"name":"prev_conflict","type":"\"double\"","metadata":"{}"},{"name":"pt_suc","type":"\"double\"","metadata":"{}"},{"name":"pt_attempt","type":"\"double\"","metadata":"{}"},{"name":"precip","type":"\"double\"","metadata":"{}"},{"name":"couprisk","type":"\"double\"","metadata":"{}"},{"name":"pctile_risk","type":"\"double\"","metadata":"{}"}],"aggError":"","aggData":[],"addedWidgets":{},"metadata":{},"dbfsResultPath":null,"type":"table","aggOverflow":false,"aggSeriesLimitReached":false,"arguments":{}}},"output_type":"display_data","data":{"text/html":["
ccodecountryleaderyearmonthelectedagemalemilitarycareertenure_monthsgovernmentanticipationref_antleg_antexec_antirreg_lead_antelection_nowelection_recentleg_recentexec_recentlead_recentref_recentdirect_recentindirect_recentvictory_recentdefeat_recentchange_recentnochange_recentdelayedlastelectionlossirregularprev_conflictpt_sucpt_attemptprecipcoupriskpctile_risk
2.0USATruman1950.01.01.066.010.058.0Presidential Democracy0.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.02.63905745.3278767.5657930.00.00.0-0.0690575837052633nullnull
2.0USATruman1950.02.01.066.010.059.0Presidential Democracy0.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.02.70805035.3327197.56631140.00.00.0-0.11372068300939nullnull
2.0USATruman1950.03.01.066.010.060.0Presidential Democracy0.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.02.77258875.33753827.56682870.00.00.0-0.108042069627093nullnull
2.0USATruman1950.04.01.066.010.061.0Presidential Democracy0.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.02.83321335.34233437.56734560.00.00.0-0.0416001452793281nullnull
2.0USATruman1950.05.01.066.010.062.0Presidential Democracy0.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.02.89037185.34710747.56786250.00.00.0-0.129702783937251nullnull
2.0USATruman1950.06.01.066.010.063.0Presidential Democracy0.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.02.9444395.3518587.56837940.00.00.0-0.178496151195764nullnull
2.0USATruman1950.07.01.066.010.064.0Presidential Democracy0.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.02.99573235.35658657.5688960.00.00.0-0.042660054596682nullnull
"]}}],"execution_count":0},{"cell_type":"markdown","source":["Much better! Let's see how many partitions we have."],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"67d7e537-58ea-49ae-ba3a-af3c147e5c9f"}}},{"cell_type":"code","source":["df.rdd.getNumPartitions()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"4a15e917-eb4b-4a8f-b220-b16e28abd4fa"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"
Out[26]: 8
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n
Out[26]: 8
"]}}],"execution_count":0},{"cell_type":"markdown","source":["Wow! The data is split across 8 partitions! We can always change this number, but for this example we will leave it as 8.\n\nNext, we make some changes to the data to prepare it for analysis"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"a9078ce6-21be-42e4-afe9-a7d5daa6547f"}}},{"cell_type":"code","source":["from pyspark.sql.functions import lit, lpad, col, concat, date_format\nfrom pyspark.sql.types import DateType\n\n\ndf = df.withColumn('country-leader', concat(col('country'),lit(\":\"),col('leader')))\ndf = df.withColumn('year-month', concat(col('year').cast('integer').cast('string'),lit(\"-\"), lpad(col('month').cast('integer').cast('string'), 2, \"0\"), lit(\"-\"),lit(\"01\")))\n\ndf = df.withColumn('year-month', df['year-month'].cast(DateType()))\n\ncols = ['country-leader', 'year-month'] + [x for x in df.columns if x not in [\"ccode\", \"country-leader\", \"leader\", \"year-month\"]]\ndf = df.select(cols)\ntotal_obs = df.count()\ndf = df.drop_duplicates(subset = [\"country-leader\", \"year-month\"])\nn_duplicates = total_obs - df.count()\nprint(f\"{n_duplicates} observations with a duplicated identifier pair deleted.\")"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"218c4ac8-a65d-4a77-954f-8e498a1c757b"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"
7 observations with a duplicated identifier pair deleted.\n
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n
7 observations with a duplicated identifier pair deleted.\n
"]}}],"execution_count":0},{"cell_type":"markdown","source":["Now that we have created unique identifiers for the individual and time, we pass the data through the Panel Data Processor, specifying a value of 4 for 'TEST_INTERVALS' as we want to test the last 4 periods. For the time being, we transform the time_id back to a numeric feature given constraints regarding datetime functionality."],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"6b2a6e06-6592-4ad8-8a20-847b79478d0c"}}},{"cell_type":"code","source":["test_intervals = 4\nprocessor = PanelDataProcessor(data=df, config = {'TEST_INTERVALS': test_intervals}, shuffle_parts = 20)\nprocessor.build_processed_data()\ndisplay(processor.data.limit(7))"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"9f2fba7d-3ab0-4abf-afaf-3250be4efd35"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"
Time identifier column name not given; assumed to be second-leftmost column (year-month)\nIndividual identifier column name not given; assumed to be leftmost column (country-leader)\n
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n
Time identifier column name not given; assumed to be second-leftmost column (year-month)\nIndividual identifier column name not given; assumed to be leftmost column (country-leader)\n
"]}},{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"overflow":false,"datasetInfos":[],"data":[["Afghanistan:Abdallah Yakta","1967-10-01","Afghanistan",1967.0,10.0,0.0,53.0,1,0.0,1.0,"Monarchy",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.1246834,6.1246834,6.1246834,0.0,0.0,0.0,0.0187039816454769,null,null,213,false,false,true,629,0,1,true],["Afghanistan:Abdallah Yakta","1967-11-01","Afghanistan",1967.0,11.0,0.0,53.0,1,0.0,2.0,"Monarchy",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.126869,6.126869,6.126869,0.0,0.0,0.0,0.17923993006129,null,null,214,false,false,true,628,0,0,true],["Afghanistan:Abdul Zahir","1971-06-01","Afghanistan",1971.0,6.0,0.0,61.0,1,0.0,1.0,"Monarchy",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.216606,6.216606,6.216606,0.0,0.0,0.0,-1.80655395371697,null,null,257,false,false,false,585,0,18,true],["Afghanistan:Abdul Zahir","1971-07-01","Afghanistan",1971.0,7.0,0.0,61.0,1,0.0,2.0,"Monarchy",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.2186003,6.2186003,6.2186003,0.0,0.0,0.0,-1.79781203785734,null,null,258,false,false,false,584,0,17,true],["Afghanistan:Abdul Zahir","1971-08-01","Afghanistan",1971.0,8.0,0.0,61.0,1,0.0,3.0,"Monarchy",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.22059,6.22059,6.22059,0.0,0.0,0.0,-1.81513438987383,null,null,259,false,false,false,583,0,16,true],["Afghanistan:Abdul Zahir","1971-09-01","Afghanistan",1971.0,9.0,0.0,61.0,1,0.0,4.0,"Monarchy",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.222576,6.222576,6.222576,0.0,0.0,0.0,-1.84107666146101,null,null,260,false,false,false,582,0,15,true],["Afghanistan:Abdul 
Zahir","1971-10-01","Afghanistan",1971.0,10.0,0.0,61.0,1,0.0,5.0,"Monarchy",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.2245584,6.2245584,6.2245584,0.0,0.0,0.0,-1.90343827157157,null,null,261,false,false,false,581,0,14,true]],"plotOptions":{"displayType":"table","customPlotOptions":{},"pivotColumns":[],"pivotAggregation":null,"xColumns":[],"yColumns":[]},"columnCustomDisplayInfos":{},"aggType":"","isJsonSchema":true,"removedWidgets":[],"aggSchema":[],"schema":[{"name":"country-leader","type":"\"string\"","metadata":"{}"},{"name":"year-month","type":"\"date\"","metadata":"{}"},{"name":"country","type":"\"string\"","metadata":"{}"},{"name":"year","type":"\"double\"","metadata":"{}"},{"name":"month","type":"\"double\"","metadata":"{}"},{"name":"elected","type":"\"double\"","metadata":"{}"},{"name":"age","type":"\"double\"","metadata":"{}"},{"name":"male","type":"\"integer\"","metadata":"{}"},{"name":"militarycareer","type":"\"double\"","metadata":"{}"},{"name":"tenure_months","type":"\"double\"","metadata":"{}"},{"name":"government","type":"\"string\"","metadata":"{}"},{"name":"anticipation","type":"\"double\"","metadata":"{}"},{"name":"ref_ant","type":"\"double\"","metadata":"{}"},{"name":"leg_ant","type":"\"double\"","metadata":"{}"},{"name":"exec_ant","type":"\"double\"","metadata":"{}"},{"name":"irreg_lead_ant","type":"\"double\"","metadata":"{}"},{"name":"election_now","type":"\"double\"","metadata":"{}"},{"name":"election_recent","type":"\"double\"","metadata":"{}"},{"name":"leg_recent","type":"\"double\"","metadata":"{}"},{"name":"exec_recent","type":"\"double\"","metadata":"{}"},{"name":"lead_recent","type":"\"double\"","metadata":"{}"},{"name":"ref_recent","type":"\"double\"","metadata":"{}"},{"name":"direct_recent","type":"\"double\"","metadata":"{}"},{"name":"indirect_recent","type":"\"double\"","metadata":"{}"},{"name":"victory_recent","type":"\"double\"","metadata":"{}"},{"name":"defeat_recent","type":"\"double\"","metadata":"
{}"},{"name":"change_recent","type":"\"double\"","metadata":"{}"},{"name":"nochange_recent","type":"\"double\"","metadata":"{}"},{"name":"delayed","type":"\"double\"","metadata":"{}"},{"name":"lastelection","type":"\"double\"","metadata":"{}"},{"name":"loss","type":"\"double\"","metadata":"{}"},{"name":"irregular","type":"\"double\"","metadata":"{}"},{"name":"prev_conflict","type":"\"double\"","metadata":"{}"},{"name":"pt_suc","type":"\"double\"","metadata":"{}"},{"name":"pt_attempt","type":"\"double\"","metadata":"{}"},{"name":"precip","type":"\"double\"","metadata":"{}"},{"name":"couprisk","type":"\"double\"","metadata":"{}"},{"name":"pctile_risk","type":"\"double\"","metadata":"{}"},{"name":"_period","type":"\"integer\"","metadata":"{}"},{"name":"_predict_obs","type":"\"boolean\"","metadata":"{}"},{"name":"_test","type":"\"boolean\"","metadata":"{}"},{"name":"_validation","type":"\"boolean\"","metadata":"{}"},{"name":"_maximum_lead","type":"\"integer\"","metadata":"{}"},{"name":"_spell","type":"\"long\"","metadata":"{}"},{"name":"_duration","type":"\"long\"","metadata":"{}"},{"name":"_event_observed","type":"\"boolean\"","metadata":"{}"}],"aggError":"","aggData":[],"addedWidgets":{},"metadata":{},"dbfsResultPath":null,"type":"table","aggOverflow":false,"aggSeriesLimitReached":false,"arguments":{}}},"output_type":"display_data","data":{"text/html":["
country-leaderyear-monthcountryyearmonthelectedagemalemilitarycareertenure_monthsgovernmentanticipationref_antleg_antexec_antirreg_lead_antelection_nowelection_recentleg_recentexec_recentlead_recentref_recentdirect_recentindirect_recentvictory_recentdefeat_recentchange_recentnochange_recentdelayedlastelectionlossirregularprev_conflictpt_sucpt_attemptprecipcoupriskpctile_risk_period_predict_obs_test_validation_maximum_lead_spell_duration_event_observed
Afghanistan:Abdallah Yakta1967-10-01Afghanistan1967.010.00.053.010.01.0Monarchy0.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.06.12468346.12468346.12468340.00.00.00.0187039816454769nullnull213falsefalsetrue62901true
Afghanistan:Abdallah Yakta1967-11-01Afghanistan1967.011.00.053.010.02.0Monarchy0.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.06.1268696.1268696.1268690.00.00.00.17923993006129nullnull214falsefalsetrue62800true
Afghanistan:Abdul Zahir1971-06-01Afghanistan1971.06.00.061.010.01.0Monarchy0.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.06.2166066.2166066.2166060.00.00.0-1.80655395371697nullnull257falsefalsefalse585018true
Afghanistan:Abdul Zahir1971-07-01Afghanistan1971.07.00.061.010.02.0Monarchy0.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.06.21860036.21860036.21860030.00.00.0-1.79781203785734nullnull258falsefalsefalse584017true
Afghanistan:Abdul Zahir1971-08-01Afghanistan1971.08.00.061.010.03.0Monarchy0.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.06.220596.220596.220590.00.00.0-1.81513438987383nullnull259falsefalsefalse583016true
Afghanistan:Abdul Zahir1971-09-01Afghanistan1971.09.00.061.010.04.0Monarchy0.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.06.2225766.2225766.2225760.00.00.0-1.84107666146101nullnull260falsefalsefalse582015true
Afghanistan:Abdul Zahir1971-10-01Afghanistan1971.010.00.061.010.05.0Monarchy0.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.06.22455846.22455846.22455840.00.00.0-1.90343827157157nullnull261falsefalsefalse581014true
"]}}],"execution_count":0},{"cell_type":"markdown","source":["Now, we build the model. You can pass parameters into the model that will be passed to lightgbm as well."],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"13860e67-79fa-4a11-8db3-8aa289431934"}}},{"cell_type":"code","source":["modeler = LGBSurvivalModeler(data=processor.data)\nmodeler.build_model(n_intervals=test_intervals)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"7d57b970-74c9-4096-8247-d249cb1f4f17"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":0},{"cell_type":"markdown","source":["Now we want to see how well our model performs on the test data and take a look at the resulting metrics."],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"87e38942-9640-485e-b3c3-1f0bc573fde5"}}},{"cell_type":"code","source":["metrics = modeler.evaluate()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"687fae41-e346-40a8-a900-b8e992a6193f"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"
Now evaluating lead length: 1 of 4\n2.2871768474578857\n70.40783166885376\n1.4782764911651611\n0.20274138450622559\n3.9578638076782227\nNow evaluating lead length: 2 of 4\n1.112673044204712\n88.06950092315674\n3.842557191848755\n0.30141425132751465\n9.198489665985107\nNow evaluating lead length: 3 of 4\n1.9427433013916016\n95.69927430152893\n2.6116440296173096\n0.323838472366333\n6.349407434463501\nNow evaluating lead length: 4 of 4\n1.8285760879516602\n91.48771524429321\n3.2123091220855713\n0.21567392349243164\n7.031679391860962\n
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n
Now evaluating lead length: 1 of 4\n2.2871768474578857\n70.40783166885376\n1.4782764911651611\n0.20274138450622559\n3.9578638076782227\nNow evaluating lead length: 2 of 4\n1.112673044204712\n88.06950092315674\n3.842557191848755\n0.30141425132751465\n9.198489665985107\nNow evaluating lead length: 3 of 4\n1.9427433013916016\n95.69927430152893\n2.6116440296173096\n0.323838472366333\n6.349407434463501\nNow evaluating lead length: 4 of 4\n1.8285760879516602\n91.48771524429321\n3.2123091220855713\n0.21567392349243164\n7.031679391860962\n
"]}}],"execution_count":0},{"cell_type":"code","source":["metrics"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"e2776163-99f9-43ee-8626-2da3359299c9"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"
Out[31]:
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n
Out[31]:
"]}},{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
AUROCActual SharePredicted ShareTrue PositivesFalse NegativesFalse PositivesTrue NegativesOther Metrics:
Lead Length
10.8835050.9748740.988642194041
20.9322920.9648240.969939190252
30.8883040.9547740.958899188272
40.8366420.9346730.9384731833103
\n
","textData":null,"removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"htmlSandbox","arguments":{}}},"output_type":"display_data","data":{"text/html":["
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
AUROCActual SharePredicted ShareTrue PositivesFalse NegativesFalse PositivesTrue NegativesOther Metrics:
Lead Length
10.8835050.9748740.988642194041
20.9322920.9648240.969939190252
30.8883040.9547740.958899188272
40.8366420.9346730.9384731833103
\n
"]}}],"execution_count":0},{"cell_type":"markdown","source":["And finally, we want to forecast future survival probabilities for country-leaders in the last period"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"cbb4f7b2-9068-4b10-8eb5-423dc0f7081a"}}},{"cell_type":"code","source":["forecasts = modeler.forecast()\nforecasts.head(10)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"4cef3779-3256-43b1-a736-6af1d59b8187"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"
Out[32]:
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n
Out[32]:
"]}},{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
1-period Survival Probability2-period Survival Probability3-period Survival Probability4-period Survival Probability
00.9970370.9967150.9951110.992604
10.9943770.9902830.9866370.977831
20.9839840.9784640.9717960.954448
30.9930380.9772670.9669080.964620
40.9987900.9974960.9955010.993531
50.9896120.9877220.9844500.968102
60.9947470.9777370.9765690.974783
70.9810610.9802910.9766300.968736
80.9893490.9865570.9796860.976169
90.9861990.4527830.4479130.440717
\n
","textData":null,"removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"htmlSandbox","arguments":{}}},"output_type":"display_data","data":{"text/html":["
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
1-period Survival Probability2-period Survival Probability3-period Survival Probability4-period Survival Probability
00.9970370.9967150.9951110.992604
10.9943770.9902830.9866370.977831
20.9839840.9784640.9717960.954448
30.9930380.9772670.9669080.964620
40.9987900.9974960.9955010.993531
50.9896120.9877220.9844500.968102
60.9947470.9777370.9765690.974783
70.9810610.9802910.9766300.968736
80.9893490.9865570.9796860.976169
90.9861990.4527830.4479130.440717
\n
"]}}],"execution_count":0},{"cell_type":"code","source":[""],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"f2e7bd1c-35b0-4fe4-996a-d468f2ce83f0"}},"outputs":[],"execution_count":0}],"metadata":{"application/vnd.databricks.v1+notebook":{"notebookName":"example_reign_notebook","dashboards":[],"notebookMetadata":{"pythonIndentUnit":2},"language":"python","widgets":{},"notebookOrigID":1405971115483140}},"nbformat":4,"nbformat_minor":0}