{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Infobox Counts from XML Dumps\n", "- extract infobox data from dump files\n", "- create and populate `population_infobox` table\n", "- report infobox counts" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data\n", "%run -i 'data-defaults.py'" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "WIKIPEDIA_XML_DUMPS = ['enwiki-20190401-pages-articles-multistream.xml.bz2',\n", " 'enwiki-20190420-pages-articles-multistream.xml.bz2']\n", "\n", "INFOBOX_REGEX = re.compile(r'\\{ *infobox ', re.IGNORECASE)\n", "\n", "def count_infobox(entity, date):\n", " page_text = entity.revision.text._VALUE\n", " count = len(INFOBOX_REGEX.findall(page_text))\n", " return Row(page_id=entity.id, infobox_count=count, dt=date)\n", "\n", "infobox_rdd = sc.emptyRDD()\n", "for file in WIKIPEDIA_XML_DUMPS:\n", " wikipedia = sqlContext.read.format('com.databricks.spark.xml').options(rowTag='page').load(file)\n", " dump_date = re.search(r'.*(\\d{8}).*',file).group(1)\n", " articles = wikipedia\\\n", " .filter(\"ns = '0'\")\\\n", " .filter(\"redirect._title is null\") \\\n", " .filter(\"revision.text._VALUE is not null\") \\\n", " .filter(\"length(revision.text._VALUE) > 0\")\n", " daily_counts = sqlContext.createDataFrame(articles.rdd.map(lambda entity: count_infobox(entity, dump_date)))\n", " infobox_rdd = infobox_rdd.union(daily_counts.rdd)\n", "\n", "infobox_merged = sqlContext.createDataFrame(infobox_rdd)\n", "infobox = infobox_merged.toPandas()\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | dt | \n", "infobox_count | \n", "page_id | \n", "
|---|---|---|---|
| 0 | \n", "20190401 | \n", "0 | \n", "12 | \n", "
| 1 | \n", "20190401 | \n", "1 | \n", "25 | \n", "
| 2 | \n", "20190401 | \n", "0 | \n", "39 | \n", "
| 3 | \n", "20190401 | \n", "1 | \n", "290 | \n", "
| 4 | \n", "20190401 | \n", "2 | \n", "303 | \n", "
| 5 | \n", "20190401 | \n", "0 | \n", "305 | \n", "
| 6 | \n", "20190401 | \n", "2 | \n", "307 | \n", "
| 7 | \n", "20190401 | \n", "1 | \n", "308 | \n", "
| 8 | \n", "20190401 | \n", "0 | \n", "309 | \n", "
| 9 | \n", "20190401 | \n", "1 | \n", "316 | \n", "
| 10 | \n", "20190401 | \n", "2 | \n", "324 | \n", "
| 11 | \n", "20190401 | \n", "1 | \n", "330 | \n", "
| 12 | \n", "20190401 | \n", "2 | \n", "332 | \n", "
| 13 | \n", "20190401 | \n", "0 | \n", "334 | \n", "
| 14 | \n", "20190401 | \n", "0 | \n", "336 | \n", "
| 15 | \n", "20190401 | \n", "1 | \n", "339 | \n", "
| 16 | \n", "20190401 | \n", "1 | \n", "340 | \n", "
| 17 | \n", "20190401 | \n", "1 | \n", "344 | \n", "
| 18 | \n", "20190401 | \n", "1 | \n", "358 | \n", "
| 19 | \n", "20190401 | \n", "0 | \n", "359 | \n", "
| 20 | \n", "20190401 | \n", "0 | \n", "569 | \n", "
| 21 | \n", "20190401 | \n", "0 | \n", "572 | \n", "
| 22 | \n", "20190401 | \n", "0 | \n", "573 | \n", "
| 23 | \n", "20190401 | \n", "0 | \n", "579 | \n", "
| 24 | \n", "20190401 | \n", "0 | \n", "580 | \n", "
| 25 | \n", "20190401 | \n", "1 | \n", "586 | \n", "
| 26 | \n", "20190401 | \n", "0 | \n", "590 | \n", "
| 27 | \n", "20190401 | \n", "0 | \n", "593 | \n", "
| 28 | \n", "20190401 | \n", "1 | \n", "594 | \n", "
| 29 | \n", "20190401 | \n", "1 | \n", "595 | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "
| 11686877 | \n", "20190420 | \n", "1 | \n", "60552524 | \n", "
| 11686878 | \n", "20190420 | \n", "1 | \n", "60552528 | \n", "
| 11686879 | \n", "20190420 | \n", "0 | \n", "60552534 | \n", "
| 11686880 | \n", "20190420 | \n", "0 | \n", "60552561 | \n", "
| 11686881 | \n", "20190420 | \n", "1 | \n", "60552576 | \n", "
| 11686882 | \n", "20190420 | \n", "1 | \n", "60552592 | \n", "
| 11686883 | \n", "20190420 | \n", "1 | \n", "60552597 | \n", "
| 11686884 | \n", "20190420 | \n", "0 | \n", "60552601 | \n", "
| 11686885 | \n", "20190420 | \n", "1 | \n", "60552606 | \n", "
| 11686886 | \n", "20190420 | \n", "0 | \n", "60552607 | \n", "
| 11686887 | \n", "20190420 | \n", "1 | \n", "60552611 | \n", "
| 11686888 | \n", "20190420 | \n", "1 | \n", "60552619 | \n", "
| 11686889 | \n", "20190420 | \n", "0 | \n", "60552630 | \n", "
| 11686890 | \n", "20190420 | \n", "1 | \n", "60552632 | \n", "
| 11686891 | \n", "20190420 | \n", "1 | \n", "60552643 | \n", "
| 11686892 | \n", "20190420 | \n", "1 | \n", "60552649 | \n", "
| 11686893 | \n", "20190420 | \n", "1 | \n", "60552656 | \n", "
| 11686894 | \n", "20190420 | \n", "1 | \n", "60552665 | \n", "
| 11686895 | \n", "20190420 | \n", "1 | \n", "60552674 | \n", "
| 11686896 | \n", "20190420 | \n", "1 | \n", "60552678 | \n", "
| 11686897 | \n", "20190420 | \n", "1 | \n", "60552679 | \n", "
| 11686898 | \n", "20190420 | \n", "1 | \n", "60552689 | \n", "
| 11686899 | \n", "20190420 | \n", "1 | \n", "60552690 | \n", "
| 11686900 | \n", "20190420 | \n", "1 | \n", "60552709 | \n", "
| 11686901 | \n", "20190420 | \n", "1 | \n", "60552711 | \n", "
| 11686902 | \n", "20190420 | \n", "1 | \n", "60552724 | \n", "
| 11686903 | \n", "20190420 | \n", "1 | \n", "60552730 | \n", "
| 11686904 | \n", "20190420 | \n", "0 | \n", "60552736 | \n", "
| 11686905 | \n", "20190420 | \n", "1 | \n", "60552745 | \n", "
| 11686906 | \n", "20190420 | \n", "1 | \n", "60552764 | \n", "
11686907 rows × 3 columns
\n", "| \n", " | dt | \n", "infobox_count | \n", "page_id | \n", "
|---|---|---|---|
| 5011 | \n", "20190401 | \n", "49 | \n", "10577 | \n", "
| 6537 | \n", "20190401 | \n", "58 | \n", "13696 | \n", "
| 49147 | \n", "20190401 | \n", "34 | \n", "102962 | \n", "
| 49165 | \n", "20190401 | \n", "27 | \n", "102985 | \n", "
| 49565 | \n", "20190401 | \n", "29 | \n", "103782 | \n", "
| 162390 | \n", "20190401 | \n", "52 | \n", "373334 | \n", "
| 164886 | \n", "20190401 | \n", "34 | \n", "381506 | \n", "
| 169633 | \n", "20190401 | \n", "28 | \n", "397295 | \n", "
| 276936 | \n", "20190401 | \n", "78 | \n", "782670 | \n", "
| 285251 | \n", "20190401 | \n", "32 | \n", "841648 | \n", "
| 293703 | \n", "20190401 | \n", "239 | \n", "871024 | \n", "
| 309385 | \n", "20190401 | \n", "34 | \n", "931088 | \n", "
| 324731 | \n", "20190401 | \n", "27 | \n", "996290 | \n", "
| 346730 | \n", "20190401 | \n", "34 | \n", "1086718 | \n", "
| 350842 | \n", "20190401 | \n", "35 | \n", "1104268 | \n", "
| 350865 | \n", "20190401 | \n", "36 | \n", "1104332 | \n", "
| 350906 | \n", "20190401 | \n", "35 | \n", "1104505 | \n", "
| 350910 | \n", "20190401 | \n", "35 | \n", "1104515 | \n", "
| 350918 | \n", "20190401 | \n", "38 | \n", "1104564 | \n", "
| 351288 | \n", "20190401 | \n", "35 | \n", "1106128 | \n", "
| 351294 | \n", "20190401 | \n", "33 | \n", "1106178 | \n", "
| 351311 | \n", "20190401 | \n", "33 | \n", "1106241 | \n", "
| 351326 | \n", "20190401 | \n", "26 | \n", "1106299 | \n", "
| 351346 | \n", "20190401 | \n", "29 | \n", "1106375 | \n", "
| 351506 | \n", "20190401 | \n", "28 | \n", "1106937 | \n", "
| 357754 | \n", "20190401 | \n", "34 | \n", "1130287 | \n", "
| 358597 | \n", "20190401 | \n", "36 | \n", "1133844 | \n", "
| 358688 | \n", "20190401 | \n", "39 | \n", "1134236 | \n", "
| 506704 | \n", "20190401 | \n", "28 | \n", "1847173 | \n", "
| 526287 | \n", "20190401 | \n", "80 | \n", "1945597 | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "
| 11306108 | \n", "20190420 | \n", "31 | \n", "55569842 | \n", "
| 11308223 | \n", "20190420 | \n", "35 | \n", "55595865 | \n", "
| 11309929 | \n", "20190420 | \n", "123 | \n", "55619912 | \n", "
| 11315945 | \n", "20190420 | \n", "179 | \n", "55697297 | \n", "
| 11321659 | \n", "20190420 | \n", "26 | \n", "55763651 | \n", "
| 11325516 | \n", "20190420 | \n", "30 | \n", "55809734 | \n", "
| 11332444 | \n", "20190420 | \n", "27 | \n", "55887267 | \n", "
| 11342183 | \n", "20190420 | \n", "28 | \n", "56002822 | \n", "
| 11353183 | \n", "20190420 | \n", "119 | \n", "56142356 | \n", "
| 11357978 | \n", "20190420 | \n", "129 | \n", "56192995 | \n", "
| 11371695 | \n", "20190420 | \n", "145 | \n", "56363828 | \n", "
| 11377401 | \n", "20190420 | \n", "142 | \n", "56434290 | \n", "
| 11385423 | \n", "20190420 | \n", "130 | \n", "56532039 | \n", "
| 11385904 | \n", "20190420 | \n", "126 | \n", "56538063 | \n", "
| 11389134 | \n", "20190420 | \n", "136 | \n", "56577813 | \n", "
| 11413632 | \n", "20190420 | \n", "334 | \n", "56887146 | \n", "
| 11414205 | \n", "20190420 | \n", "145 | \n", "56891883 | \n", "
| 11435248 | \n", "20190420 | \n", "28 | \n", "57147301 | \n", "
| 11472465 | \n", "20190420 | \n", "139 | \n", "57437661 | \n", "
| 11475831 | \n", "20190420 | \n", "122 | \n", "57482185 | \n", "
| 11479726 | \n", "20190420 | \n", "141 | \n", "57536902 | \n", "
| 11485287 | \n", "20190420 | \n", "136 | \n", "57620451 | \n", "
| 11486673 | \n", "20190420 | \n", "146 | \n", "57637971 | \n", "
| 11492147 | \n", "20190420 | \n", "128 | \n", "57706926 | \n", "
| 11501751 | \n", "20190420 | \n", "126 | \n", "57839087 | \n", "
| 11506322 | \n", "20190420 | \n", "71 | \n", "57897975 | \n", "
| 11520191 | \n", "20190420 | \n", "27 | \n", "58072995 | \n", "
| 11537552 | \n", "20190420 | \n", "30 | \n", "58344889 | \n", "
| 11564773 | \n", "20190420 | \n", "75 | \n", "58742596 | \n", "
| 11652475 | \n", "20190420 | \n", "43 | \n", "60089397 | \n", "
729 rows × 3 columns
\n", "| \n", " | pages_w_infobox | \n", "extract_date | \n", "
|---|---|---|
| 0 | \n", "5839083 | \n", "20190401 | \n", "
| 1 | \n", "5847824 | \n", "20190420 | \n", "