{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import requests\n", "import pandas as pd\n", "import xml.etree.ElementTree as ET" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# XML 1" ] }, { "cell_type": "code", "execution_count": 132, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 132, "metadata": {}, "output_type": "execute_result" } ], "source": [ "xml_url = 'https://www.w3schools.com/xml/cd_catalog.xml'\n", "\n", "# Read XML from URL...\n", "xml_response = requests.get(xml_url, timeout=5)\n", "my_xml_root = ET.fromstring(xml_response.text)\n", "\n", "\n", "# Parse XML from str...\n", "# my_xml_root = ET.fromstring(xml_data) # to read from file: ET.parse('sample.xml').getroot()\n", "\n", "\n", "# # Parse XML from file...\n", "# my_xml_root = ET.parse(xml_file).getroot()\n", "\n", "\n", "my_xml_root" ] }, { "cell_type": "code", "execution_count": 133, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(,\n", " ,\n", " )" ] }, "execution_count": 133, "metadata": {}, "output_type": "execute_result" } ], "source": [ "my_xml_root1, my_xml_root2, my_xml_root3" ] }, { "cell_type": "code", "execution_count": 201, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TITLE {} Empire Burlesque\n", "ARTIST {} Bob Dylan\n", "COUNTRY {} USA\n", "COMPANY {} Columbia\n", "PRICE {} 10.90\n", "YEAR {} 1985\n", "Empire Burlesque\n", "Bob Dylan\n", "USA\n", "Columbia\n", "10.90\n", "1985\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TitleArtistCountryCompanyPriceYear
0Empire BurlesqueBob DylanUSAColumbia10.901985
1Hide your heartBonnie TylerUKCBS Records9.901988
2Greatest HitsDolly PartonUSARCA9.901982
3Still got the bluesGary MooreUKVirgin records10.201990
4ErosEros RamazzottiEUBMG9.901997
5One night onlyBee GeesUKPolydor10.901998
6Sylvias MotherDr.HookUKCBS8.101973
7Maggie MayRod StewartUKPickwick8.501990
8RomanzaAndrea BocelliEUPolydor10.801996
9When a man loves a womanPercy SledgeUSAAtlantic8.701987
10Black angelSavage RoseEUMega10.901995
111999 Grammy NomineesManyUSAGrammy10.201999
12For the good timesKenny RogersUKMucik Master8.701995
13Big Willie styleWill SmithUSAColumbia9.901997
14Tupelo HoneyVan MorrisonUKPolydor8.201971
15SoulsvilleJorn HoelNorwayWEA7.901996
16The very best ofCat StevensUKIsland8.901990
17StopSam BrownUKA and M8.901988
18Bridge of SpiesT'PauUKSiren7.901987
19Private DancerTina TurnerUKCapitol8.901983
20Midt om nattenKim LarsenEUMedley7.801983
21Pavarotti Gala ConcertLuciano PavarottiUKDECCA9.901991
22The dock of the bayOtis ReddingUSAStax Records7.901968
23Picture bookSimply RedEUElektra7.201985
24RedThe CommunardsUKLondon7.801987
25Unchain my heartJoe CockerUSAEMI8.201987
\n", "
" ], "text/plain": [ " Title Artist Country Company \\\n", "0 Empire Burlesque Bob Dylan USA Columbia \n", "1 Hide your heart Bonnie Tyler UK CBS Records \n", "2 Greatest Hits Dolly Parton USA RCA \n", "3 Still got the blues Gary Moore UK Virgin records \n", "4 Eros Eros Ramazzotti EU BMG \n", "5 One night only Bee Gees UK Polydor \n", "6 Sylvias Mother Dr.Hook UK CBS \n", "7 Maggie May Rod Stewart UK Pickwick \n", "8 Romanza Andrea Bocelli EU Polydor \n", "9 When a man loves a woman Percy Sledge USA Atlantic \n", "10 Black angel Savage Rose EU Mega \n", "11 1999 Grammy Nominees Many USA Grammy \n", "12 For the good times Kenny Rogers UK Mucik Master \n", "13 Big Willie style Will Smith USA Columbia \n", "14 Tupelo Honey Van Morrison UK Polydor \n", "15 Soulsville Jorn Hoel Norway WEA \n", "16 The very best of Cat Stevens UK Island \n", "17 Stop Sam Brown UK A and M \n", "18 Bridge of Spies T'Pau UK Siren \n", "19 Private Dancer Tina Turner UK Capitol \n", "20 Midt om natten Kim Larsen EU Medley \n", "21 Pavarotti Gala Concert Luciano Pavarotti UK DECCA \n", "22 The dock of the bay Otis Redding USA Stax Records \n", "23 Picture book Simply Red EU Elektra \n", "24 Red The Communards UK London \n", "25 Unchain my heart Joe Cocker USA EMI \n", "\n", " Price Year \n", "0 10.90 1985 \n", "1 9.90 1988 \n", "2 9.90 1982 \n", "3 10.20 1990 \n", "4 9.90 1997 \n", "5 10.90 1998 \n", "6 8.10 1973 \n", "7 8.50 1990 \n", "8 10.80 1996 \n", "9 8.70 1987 \n", "10 10.90 1995 \n", "11 10.20 1999 \n", "12 8.70 1995 \n", "13 9.90 1997 \n", "14 8.20 1971 \n", "15 7.90 1996 \n", "16 8.90 1990 \n", "17 8.90 1988 \n", "18 7.90 1987 \n", "19 8.90 1983 \n", "20 7.80 1983 \n", "21 9.90 1991 \n", "22 7.90 1968 \n", "23 7.20 1985 \n", "24 7.80 1987 \n", "25 8.20 1987 " ] }, "execution_count": 201, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# The root element is only one in this case and it has no attributes as seen below...\n", "my_xml_root.tag # 'CATALOG'\n", "my_xml_root.attrib # empty dict {}\n", "\n", "\n", "# The first child...\n", "my_xml_root[0]\n", "\n", "for x in my_xml_root[0]:\n", " print(x.tag, x.attrib, x.text)\n", "# =========================================\n", "\n", " \n", "# Now lets get all children as well as there text attributes\n", "all_children = list(my_xml_root) # Old way is: my_xml_root.getchildren()\n", "\n", "for child in all_children:\n", " print(child.find('TITLE').text)\n", " print(child.find('ARTIST').text)\n", " print(child.find('COUNTRY').text)\n", " print(child.find('COMPANY').text)\n", " print(child.find('PRICE').text)\n", " print(child.find('YEAR').text)\n", " \n", " break # only print the first record\n", "# =========================================\n", "\n", "\n", "\n", "# Now we can collect the records into list/dictionary to construct a friend dataframe structure\n", "data_list = []\n", "for child in all_children:\n", " data_dict = {}\n", " \n", " data_dict['Title'] = child.find('TITLE').text\n", " data_dict['Artist'] = child.find('ARTIST').text\n", " data_dict['Country'] = child.find('COUNTRY').text\n", " data_dict['Company'] = child.find('COMPANY').text\n", " data_dict['Price'] = child.find('PRICE').text\n", " data_dict['Year'] = child.find('YEAR').text\n", " \n", " data_list.append(data_dict)\n", "# =========================================\n", "\n", " \n", "\n", "# Generate the df...\n", "data_list_df = pd.DataFrame(data_list)\n", "data_list_df" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# XML 2" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'b: 495419K, c: 3D NETWORKS SDN. BHD., d: 100,291, e: TA-9-1 LEVEL 9, TOWER A PLAZA 33, f: NO 1, JALAN KEMAJUAN, g: SEKSYEN 13 SELANGOR, h: PETALING JAYA, i: 46200, j: [NULL], k: Selangor, l: 60379403868, m: 60379403888, n: irene.lim@planetone.asia.com, o: Services, p: 62010, q: COMPUTER INDUSTRY, r: 2000-07-11-00.00.00.000000, s: 20, t: 64, u: 64, v: M, w: Y'" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\n", "new_xml_url = 'https://spreadsheets.google.com/feeds/list/17beU3LK_ZJEp4m_WKb2D8xDyX-6_MmmgU8BAnWf4NM0/o8b7v7h/public/values/ciyn3'\n", "\n", "# Read XML from URL...\n", "xml_response = requests.get(new_xml_url, timeout=5)\n", "my_xml_root = ET.fromstring(xml_response.text)\n", "# ------------------------\n", "\n", "my_xml_root.attrib\n", "my_xml_root.tag\n", "\n", "# Get all children...\n", "all_children = list(my_xml_root)\n", "\n", "# the items we wanted are in the fifth child...\n", "all_children[4].tag\n", "all_children[4].attrib\n", "\n", "content_text = all_children[4].text\n", "content_text_list = content_text.split(', ')\n", "\n", "content_text_list" ] }, { "cell_type": "code", "execution_count": 250, "metadata": { "collapsed": true }, "outputs": [ { "data": { "text/plain": [ "['495419K',\n", " '3D NETWORKS SDN. BHD.',\n", " '100,291',\n", " 'TA-9-1 LEVEL 9',\n", " 'TOWER A PLAZA 33',\n", " 'NO 1',\n", " 'JALAN KEMAJUAN',\n", " 'SEKSYEN 13 SELANGOR',\n", " 'PETALING JAYA',\n", " '46200',\n", " '[NULL]',\n", " 'Selangor',\n", " '60379403868',\n", " '60379403888',\n", " 'irene.lim@planetone.asia.com',\n", " 'Services',\n", " '62010',\n", " 'COMPUTER INDUSTRY',\n", " '2000-07-11-00.00.00.000000',\n", " '20',\n", " '64',\n", " '64',\n", " 'M',\n", " 'Y']" ] }, "execution_count": 250, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import re\n", "\n", "# Template: re.sub(regex_pattern, new_str, original_str)\n", "[ re.sub(r'[a-z]: ', '', x_string) for x_string in content_text_list ]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 7, "metadata": { "scrolled": true }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 14, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Processing... https://spreadsheets.google.com/feeds/list/17beU3LK_ZJEp4m_WKb2D8xDyX-6_MmmgU8BAnWf4NM0/o8b7v7h/public/values/cokwr\n", "Finished....\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789...13141516171819202122
0877335W1 SECURITY SERVICES SDN. BHD.107,506NO 65A-1LORONG HARUAN 5/3OAKLAND COMMERCIAL SQUARENEGERI SEMBILANSEREMBAN70300[NULL]...info@1security.com.myServices80100SECURITY FIRMS2011-02-24-00.00.00.000000305555MY
\n", "

1 rows × 23 columns

\n", "
" ], "text/plain": [ " 0 1 2 3 \\\n", "0 877335W 1 SECURITY SERVICES SDN. BHD. 107,506 NO 65A-1 \n", "\n", " 4 5 6 7 \\\n", "0 LORONG HARUAN 5/3 OAKLAND COMMERCIAL SQUARE NEGERI SEMBILAN SEREMBAN \n", "\n", " 8 9 ... 13 14 15 16 \\\n", "0 70300 [NULL] ... info@1security.com.my Services 80100 SECURITY FIRMS \n", "\n", " 17 18 19 20 21 22 \n", "0 2011-02-24-00.00.00.000000 30 55 55 M Y \n", "\n", "[1 rows x 23 columns]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import re\n", "import requests\n", "import pandas as pd\n", "import xml.etree.ElementTree as ET\n", "\n", "\n", "xml_list = ['https://spreadsheets.google.com/feeds/list/17beU3LK_ZJEp4m_WKb2D8xDyX-6_MmmgU8BAnWf4NM0/o8b7v7h/public/values/cokwr', 'https://spreadsheets.google.com/feeds/list/17beU3LK_ZJEp4m_WKb2D8xDyX-6_MmmgU8BAnWf4NM0/o8b7v7h/public/values/cpzh4', 'https://spreadsheets.google.com/feeds/list/17beU3LK_ZJEp4m_WKb2D8xDyX-6_MmmgU8BAnWf4NM0/o8b7v7h/public/values/cre1l', 'https://spreadsheets.google.com/feeds/list/17beU3LK_ZJEp4m_WKb2D8xDyX-6_MmmgU8BAnWf4NM0/o8b7v7h/public/values/chk2m', 'https://spreadsheets.google.com/feeds/list/17beU3LK_ZJEp4m_WKb2D8xDyX-6_MmmgU8BAnWf4NM0/o8b7v7h/public/values/ciyn3', 'https://spreadsheets.google.com/feeds/list/17beU3LK_ZJEp4m_WKb2D8xDyX-6_MmmgU8BAnWf4NM0/o8b7v7h/public/values/ckd7g', 'https://spreadsheets.google.com/feeds/list/17beU3LK_ZJEp4m_WKb2D8xDyX-6_MmmgU8BAnWf4NM0/o8b7v7h/public/values/clrrx', 'https://spreadsheets.google.com/feeds/list/17beU3LK_ZJEp4m_WKb2D8xDyX-6_MmmgU8BAnWf4NM0/o8b7v7h/public/values/cyevm', 'https://spreadsheets.google.com/feeds/list/17beU3LK_ZJEp4m_WKb2D8xDyX-6_MmmgU8BAnWf4NM0/o8b7v7h/public/values/cztg3', 'https://spreadsheets.google.com/feeds/list/17beU3LK_ZJEp4m_WKb2D8xDyX-6_MmmgU8BAnWf4NM0/o8b7v7h/public/values/d180g', 'https://spreadsheets.google.com/feeds/list/17beU3LK_ZJEp4m_WKb2D8xDyX-6_MmmgU8BAnWf4NM0/o8b7v7h/public/values/d2mkx', 'https://spreadsheets.google.com/feeds/list/17beU3LK_ZJEp4m_WKb2D8xDyX-6_MmmgU8BAnWf4NM0/o8b7v7h/public/values/cssly', 'https://spreadsheets.google.com/feeds/list/17beU3LK_ZJEp4m_WKb2D8xDyX-6_MmmgU8BAnWf4NM0/o8b7v7h/public/values/cu76f', 'https://spreadsheets.google.com/feeds/list/17beU3LK_ZJEp4m_WKb2D8xDyX-6_MmmgU8BAnWf4NM0/o8b7v7h/public/values/cvlqs', 'https://spreadsheets.google.com/feeds/list/17beU3LK_ZJEp4m_WKb2D8xDyX-6_MmmgU8BAnWf4NM0/o8b7v7h/public/values/cx0b9', 'https://spreadsheets.google.com/feeds/list/17beU3LK_ZJEp4m_WKb2D8xDyX-6_MmmgU8BAnWf4NM0/o8b7v7h/public/values/d9ney', 'https://spreadsheets.google.com/feeds/list/17beU3LK_ZJEp4m_WKb2D8xDyX-6_MmmgU8BAnWf4NM0/o8b7v7h/public/values/db1zf', 'https://spreadsheets.google.com/feeds/list/17beU3LK_ZJEp4m_WKb2D8xDyX-6_MmmgU8BAnWf4NM0/o8b7v7h/public/values/dcgjs', 'https://spreadsheets.google.com/feeds/list/17beU3LK_ZJEp4m_WKb2D8xDyX-6_MmmgU8BAnWf4NM0/o8b7v7h/public/values/ddv49']\n", "\n", "data_list = []\n", "for url in xml_list:\n", " print('Processing...', url)\n", " \n", " # Read XML from URL...\n", " xml_response = requests.get(url, timeout=5)\n", " my_xml_root = ET.fromstring(xml_response.text)\n", " # ------------------------\n", " \n", " # Get all children...\n", " all_children = list(my_xml_root)\n", " \n", " # the items we wanted are in the fifth child...\n", " all_children[4].tag\n", " all_children[4].attrib\n", "\n", " content_text = all_children[4].text\n", " content_text_list = content_text.split(', ')\n", " \n", " # Template: re.sub(regex_pattern, new_str, original_str)\n", " content = [ re.sub(r'[a-z]: ', '', x_string) for x_string in content_text_list ]\n", " \n", " data_list.append(content)\n", "\n", " # break\n", "\n", "print('Finished....')\n", "\n", "# Generate the df...\n", "data_list_df = pd.DataFrame(data_list)\n", "data_list_df" ] }, { "cell_type": "code", "execution_count": 61, "metadata": { "scrolled": false }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# XML 3" ] }, { "cell_type": "code", "execution_count": 81, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "COMMON {} Bloodroot\n", "BOTANICAL {} Sanguinaria canadensis\n", "ZONE {} 4\n", "LIGHT {} Mostly Shady\n", "PRICE {} $2.44\n", "AVAILABILITY {} 031599\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CommonBotanicalZoneLightPriceAvailability
0BloodrootSanguinaria canadensis4Mostly Shady$2.44031599
1ColumbineAquilegia canadensis3Mostly Shady$9.37030699
2Marsh MarigoldCaltha palustris4Mostly Sunny$6.81051799
3CowslipCaltha palustris4Mostly Shady$9.90030699
4Dutchman's-BreechesDicentra cucullaria3Mostly Shady$6.44012099
5Ginger, WildAsarum canadense3Mostly Shady$9.03041899
6HepaticaHepatica americana4Mostly Shady$4.45012699
7LiverleafHepatica americana4Mostly Shady$3.99010299
8Jack-In-The-PulpitArisaema triphyllum4Mostly Shady$3.23020199
9MayapplePodophyllum peltatum3Mostly Shady$2.98060599
10Phlox, WoodlandPhlox divaricata3Sun or Shade$2.80012299
11Phlox, BluePhlox divaricata3Sun or Shade$5.59021699
12Spring-BeautyClaytonia Virginica7Mostly Shady$6.59020199
13TrilliumTrillium grandiflorum5Sun or Shade$3.90042999
14Wake RobinTrillium grandiflorum5Sun or Shade$3.20022199
15Violet, Dog-ToothErythronium americanum4Shade$9.04020199
16Trout LilyErythronium americanum4Shade$6.94032499
17Adder's-TongueErythronium americanum4Shade$9.58041399
18AnemoneAnemone blanda6Mostly Shady$8.86122698
19Grecian WindflowerAnemone blanda6Mostly Shady$9.16071099
20Bee BalmMonarda didyma4Shade$4.59050399
21BergamotMonarda didyma4Shade$7.16042799
22Black-Eyed SusanRudbeckia hirtaAnnualSunny$9.80061899
23ButtercupRanunculus4Shade$2.57061099
24CrowfootRanunculus4Shade$9.34040399
25Butterfly WeedAsclepias tuberosaAnnualSunny$2.78063099
26CinquefoilPotentillaAnnualShade$7.06052599
27PrimroseOenothera3 - 5Sunny$6.56013099
28GentianGentiana4Sun or Shade$7.81051899
29Blue GentianGentiana4Sun or Shade$8.56050299
30Jacob's LadderPolemonium caeruleumAnnualShade$9.26022199
31Greek ValerianPolemonium caeruleumAnnualShade$4.36071499
32California PoppyEschscholzia californicaAnnualSun$7.89032799
33Shooting StarDodecatheonAnnualMostly Shady$8.60051399
34SnakerootCimicifugaAnnualShade$5.63071199
35Cardinal FlowerLobelia cardinalis2Shade$3.02022299
\n", "
" ], "text/plain": [ " Common Botanical Zone Light \\\n", "0 Bloodroot Sanguinaria canadensis 4 Mostly Shady \n", "1 Columbine Aquilegia canadensis 3 Mostly Shady \n", "2 Marsh Marigold Caltha palustris 4 Mostly Sunny \n", "3 Cowslip Caltha palustris 4 Mostly Shady \n", "4 Dutchman's-Breeches Dicentra cucullaria 3 Mostly Shady \n", "5 Ginger, Wild Asarum canadense 3 Mostly Shady \n", "6 Hepatica Hepatica americana 4 Mostly Shady \n", "7 Liverleaf Hepatica americana 4 Mostly Shady \n", "8 Jack-In-The-Pulpit Arisaema triphyllum 4 Mostly Shady \n", "9 Mayapple Podophyllum peltatum 3 Mostly Shady \n", "10 Phlox, Woodland Phlox divaricata 3 Sun or Shade \n", "11 Phlox, Blue Phlox divaricata 3 Sun or Shade \n", "12 Spring-Beauty Claytonia Virginica 7 Mostly Shady \n", "13 Trillium Trillium grandiflorum 5 Sun or Shade \n", "14 Wake Robin Trillium grandiflorum 5 Sun or Shade \n", "15 Violet, Dog-Tooth Erythronium americanum 4 Shade \n", "16 Trout Lily Erythronium americanum 4 Shade \n", "17 Adder's-Tongue Erythronium americanum 4 Shade \n", "18 Anemone Anemone blanda 6 Mostly Shady \n", "19 Grecian Windflower Anemone blanda 6 Mostly Shady \n", "20 Bee Balm Monarda didyma 4 Shade \n", "21 Bergamot Monarda didyma 4 Shade \n", "22 Black-Eyed Susan Rudbeckia hirta Annual Sunny \n", "23 Buttercup Ranunculus 4 Shade \n", "24 Crowfoot Ranunculus 4 Shade \n", "25 Butterfly Weed Asclepias tuberosa Annual Sunny \n", "26 Cinquefoil Potentilla Annual Shade \n", "27 Primrose Oenothera 3 - 5 Sunny \n", "28 Gentian Gentiana 4 Sun or Shade \n", "29 Blue Gentian Gentiana 4 Sun or Shade \n", "30 Jacob's Ladder Polemonium caeruleum Annual Shade \n", "31 Greek Valerian Polemonium caeruleum Annual Shade \n", "32 California Poppy Eschscholzia californica Annual Sun \n", "33 Shooting Star Dodecatheon Annual Mostly Shady \n", "34 Snakeroot Cimicifuga Annual Shade \n", "35 Cardinal Flower Lobelia cardinalis 2 Shade \n", "\n", " Price Availability \n", "0 $2.44 031599 \n", "1 $9.37 030699 \n", "2 $6.81 051799 \n", "3 $9.90 030699 \n", "4 $6.44 012099 \n", "5 $9.03 041899 \n", "6 $4.45 012699 \n", "7 $3.99 010299 \n", "8 $3.23 020199 \n", "9 $2.98 060599 \n", "10 $2.80 012299 \n", "11 $5.59 021699 \n", "12 $6.59 020199 \n", "13 $3.90 042999 \n", "14 $3.20 022199 \n", "15 $9.04 020199 \n", "16 $6.94 032499 \n", "17 $9.58 041399 \n", "18 $8.86 122698 \n", "19 $9.16 071099 \n", "20 $4.59 050399 \n", "21 $7.16 042799 \n", "22 $9.80 061899 \n", "23 $2.57 061099 \n", "24 $9.34 040399 \n", "25 $2.78 063099 \n", "26 $7.06 052599 \n", "27 $6.56 013099 \n", "28 $7.81 051899 \n", "29 $8.56 050299 \n", "30 $9.26 022199 \n", "31 $4.36 071499 \n", "32 $7.89 032799 \n", "33 $8.60 051399 \n", "34 $5.63 071199 \n", "35 $3.02 022299 " ] }, "execution_count": 81, "metadata": {}, "output_type": "execute_result" } ], "source": [ "url = 'https://www.w3schools.com/xml/plant_catalog.xml'\n", "\n", "# Read XML from URL...\n", "xml_response = requests.get(url, timeout=5)\n", "my_xml_root = ET.fromstring(xml_response.text)\n", "\n", "# Get all children...\n", "all_children = list(my_xml_root)\n", "\n", "# Check tag, attribute and text of first child...\n", "all_children[0][0].tag, all_children[0][0].attrib, all_children[0][0].text\n", "\n", "# Loop to see all tag, attribute and text of first child...\n", "for x in all_children[0]:\n", " print(x.tag, x.attrib, x.text)\n", " \n", "# Now we can collect the records into list/dictionary to construct a friend dataframe structure\n", "data_list = []\n", "for child in all_children:\n", " data_dict = {}\n", " \n", " data_dict['Common'] = child.find('COMMON').text\n", " data_dict['Botanical'] = child.find('BOTANICAL').text\n", " data_dict['Zone'] = child.find('ZONE').text\n", " data_dict['Light'] = child.find('LIGHT').text\n", " data_dict['Price'] = child.find('PRICE').text\n", " data_dict['Availability'] = child.find('AVAILABILITY').text\n", " \n", " data_list.append(data_dict)\n", "# =========================================\n", "\n", "# Generate the df...\n", "data_list_df = pd.DataFrame(data_list)\n", "data_list_df\n", " " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# XML 4" ] }, { "cell_type": "code", "execution_count": 90, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "name {} Belgian Waffles\n", "price {} $5.95\n", "description {} Two of our famous Belgian Waffles with plenty of real maple syrup\n", "calories {} 650\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NamePriceDescriptionCalories
0Belgian Waffles$5.95Two of our famous Belgian Waffles with plenty ...650
1Strawberry Belgian Waffles$7.95Light Belgian waffles covered with strawberrie...900
2Berry-Berry Belgian Waffles$8.95Light Belgian waffles covered with an assortme...900
3French Toast$4.50Thick slices made from our homemade sourdough ...600
4Homestyle Breakfast$6.95Two eggs, bacon or sausage, toast, and our eve...950
\n", "
" ], "text/plain": [ " Name Price \\\n", "0 Belgian Waffles $5.95 \n", "1 Strawberry Belgian Waffles $7.95 \n", "2 Berry-Berry Belgian Waffles $8.95 \n", "3 French Toast $4.50 \n", "4 Homestyle Breakfast $6.95 \n", "\n", " Description Calories \n", "0 Two of our famous Belgian Waffles with plenty ... 650 \n", "1 Light Belgian waffles covered with strawberrie... 900 \n", "2 Light Belgian waffles covered with an assortme... 900 \n", "3 Thick slices made from our homemade sourdough ... 600 \n", "4 Two eggs, bacon or sausage, toast, and our eve... 950 " ] }, "execution_count": 90, "metadata": {}, "output_type": "execute_result" } ], "source": [ "url = 'https://www.w3schools.com/xml/simple.xml'\n", "\n", "# Read XML from URL...\n", "xml_response = requests.get(url, timeout=5)\n", "my_xml_root = ET.fromstring(xml_response.text)\n", "\n", "# Get all children...\n", "all_children = list(my_xml_root)\n", "\n", "# Check tag, attribute and text of first child...\n", "all_children[0][0].tag, all_children[0][0].attrib, all_children[0][0].text\n", "\n", "# Loop to see all tag, attribute and text of first child...\n", "for x in all_children[0]:\n", " print(x.tag, x.attrib, x.text)\n", " \n", "# Now we can collect the records into list/dictionary to construct a friend dataframe structure\n", "data_list = []\n", "for child in all_children:\n", " data_dict = {}\n", " \n", " data_dict['Name'] = child.find('name').text\n", " data_dict['Price'] = child.find('price').text\n", " data_dict['Description'] = child.find('description').text\n", " data_dict['Calories'] = child.find('calories').text\n", " \n", " data_list.append(data_dict)\n", "# =========================================\n", "\n", "# Generate the df...\n", "data_list_df = pd.DataFrame(data_list)\n", "data_list_df" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# XML 5" ] }, { "cell_type": "code", "execution_count": 95, "metadata": {}, "outputs": [], "source": [ "xml_data = '''\n", "\n", " \n", " Gambardella, Matthew\n", " XML Developer's Guide\n", " Computer\n", " 44.95\n", " 2000-10-01\n", " An in-depth look at creating applications \n", " with XML.\n", " \n", " \n", " Ralls, Kim\n", " Midnight Rain\n", " Fantasy\n", " 5.95\n", " 2000-12-16\n", " A former architect battles corporate zombies, \n", " an evil sorceress, and her own childhood to become queen \n", " of the world.\n", " \n", " \n", " Corets, Eva\n", " Maeve Ascendant\n", " Fantasy\n", " 5.95\n", " 2000-11-17\n", " After the collapse of a nanotechnology \n", " society in England, the young survivors lay the \n", " foundation for a new society.\n", " \n", " \n", " Corets, Eva\n", " Oberon's Legacy\n", " Fantasy\n", " 5.95\n", " 2001-03-10\n", " In post-apocalypse England, the mysterious \n", " agent known only as Oberon helps to create a new life \n", " for the inhabitants of London. Sequel to Maeve \n", " Ascendant.\n", " \n", " \n", " Corets, Eva\n", " The Sundered Grail\n", " Fantasy\n", " 5.95\n", " 2001-09-10\n", " The two daughters of Maeve, half-sisters, \n", " battle one another for control of England. Sequel to \n", " Oberon's Legacy.\n", " \n", " \n", " Randall, Cynthia\n", " Lover Birds\n", " Romance\n", " 4.95\n", " 2000-09-02\n", " When Carla meets Paul at an ornithology \n", " conference, tempers fly as feathers get ruffled.\n", " \n", " \n", " Thurman, Paula\n", " Splish Splash\n", " Romance\n", " 4.95\n", " 2000-11-02\n", " A deep sea diver finds true love twenty \n", " thousand leagues beneath the sea.\n", " \n", " \n", " Knorr, Stefan\n", " Creepy Crawlies\n", " Horror\n", " 4.95\n", " 2000-12-06\n", " An anthology of horror stories about roaches,\n", " centipedes, scorpions and other insects.\n", " \n", " \n", " Kress, Peter\n", " Paradox Lost\n", " Science Fiction\n", " 6.95\n", " 2000-11-02\n", " After an inadvertant trip through a Heisenberg\n", " Uncertainty Device, James Salway discovers the problems \n", " of being quantum.\n", " \n", " \n", " O'Brien, Tim\n", " Microsoft .NET: The Programming Bible\n", " Computer\n", " 36.95\n", " 2000-12-09\n", " Microsoft's .NET initiative is explored in \n", " detail in this deep programmer's reference.\n", " \n", " \n", " O'Brien, Tim\n", " MSXML3: A Comprehensive Guide\n", " Computer\n", " 36.95\n", " 2000-12-01\n", " The Microsoft MSXML3 parser is covered in \n", " detail, with attention to XML DOM interfaces, XSLT processing, \n", " SAX and more.\n", " \n", " \n", " Galos, Mike\n", " Visual Studio 7: A Comprehensive Guide\n", " Computer\n", " 49.95\n", " 2001-04-16\n", " Microsoft Visual Studio 7 is explored in depth,\n", " looking at how Visual Basic, Visual C++, C#, and ASP+ are \n", " integrated into a comprehensive development \n", " environment.\n", " \n", "'''" ] }, { "cell_type": "code", "execution_count": 97, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "author {} Gambardella, Matthew\n", "title {} XML Developer's Guide\n", "genre {} Computer\n", "price {} 44.95\n", "publish_date {} 2000-10-01\n", "description {} An in-depth look at creating applications \n", " with XML.\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AuthorTitleGenrePricePublish DateDescription
0Gambardella, MatthewXML Developer's GuideComputer44.952000-10-01An in-depth look at creating applications \\n ...
1Ralls, KimMidnight RainFantasy5.952000-12-16A former architect battles corporate zombies, ...
2Corets, EvaMaeve AscendantFantasy5.952000-11-17After the collapse of a nanotechnology \\n ...
3Corets, EvaOberon's LegacyFantasy5.952001-03-10In post-apocalypse England, the mysterious \\n ...
4Corets, EvaThe Sundered GrailFantasy5.952001-09-10The two daughters of Maeve, half-sisters, \\n ...
5Randall, CynthiaLover BirdsRomance4.952000-09-02When Carla meets Paul at an ornithology \\n ...
6Thurman, PaulaSplish SplashRomance4.952000-11-02A deep sea diver finds true love twenty \\n ...
7Knorr, StefanCreepy CrawliesHorror4.952000-12-06An anthology of horror stories about roaches,\\...
8Kress, PeterParadox LostScience Fiction6.952000-11-02After an inadvertant trip through a Heisenberg...
9O'Brien, TimMicrosoft .NET: The Programming BibleComputer36.952000-12-09Microsoft's .NET initiative is explored in \\n ...
10O'Brien, TimMSXML3: A Comprehensive GuideComputer36.952000-12-01The Microsoft MSXML3 parser is covered in \\n ...
11Galos, MikeVisual Studio 7: A Comprehensive GuideComputer49.952001-04-16Microsoft Visual Studio 7 is explored in depth...
\n", "
" ], "text/plain": [ " Author Title \\\n", "0 Gambardella, Matthew XML Developer's Guide \n", "1 Ralls, Kim Midnight Rain \n", "2 Corets, Eva Maeve Ascendant \n", "3 Corets, Eva Oberon's Legacy \n", "4 Corets, Eva The Sundered Grail \n", "5 Randall, Cynthia Lover Birds \n", "6 Thurman, Paula Splish Splash \n", "7 Knorr, Stefan Creepy Crawlies \n", "8 Kress, Peter Paradox Lost \n", "9 O'Brien, Tim Microsoft .NET: The Programming Bible \n", "10 O'Brien, Tim MSXML3: A Comprehensive Guide \n", "11 Galos, Mike Visual Studio 7: A Comprehensive Guide \n", "\n", " Genre Price Publish Date \\\n", "0 Computer 44.95 2000-10-01 \n", "1 Fantasy 5.95 2000-12-16 \n", "2 Fantasy 5.95 2000-11-17 \n", "3 Fantasy 5.95 2001-03-10 \n", "4 Fantasy 5.95 2001-09-10 \n", "5 Romance 4.95 2000-09-02 \n", "6 Romance 4.95 2000-11-02 \n", "7 Horror 4.95 2000-12-06 \n", "8 Science Fiction 6.95 2000-11-02 \n", "9 Computer 36.95 2000-12-09 \n", "10 Computer 36.95 2000-12-01 \n", "11 Computer 49.95 2001-04-16 \n", "\n", " Description \n", "0 An in-depth look at creating applications \\n ... \n", "1 A former architect battles corporate zombies, ... \n", "2 After the collapse of a nanotechnology \\n ... \n", "3 In post-apocalypse England, the mysterious \\n ... \n", "4 The two daughters of Maeve, half-sisters, \\n ... \n", "5 When Carla meets Paul at an ornithology \\n ... \n", "6 A deep sea diver finds true love twenty \\n ... \n", "7 An anthology of horror stories about roaches,\\... \n", "8 After an inadvertant trip through a Heisenberg... \n", "9 Microsoft's .NET initiative is explored in \\n ... \n", "10 The Microsoft MSXML3 parser is covered in \\n ... \n", "11 Microsoft Visual Studio 7 is explored in depth... " ] }, "execution_count": 97, "metadata": {}, "output_type": "execute_result" } ], "source": [ "url = 'https://docs.microsoft.com/en-us/previous-versions/windows/desktop/ms762271(v=vs.85)' # This url isn't an XML page, so we will read from str or file.\n", "\n", "\n", "# Read XML from URL...\n", "# xml_response = requests.get(url, timeout=5)\n", "# my_xml_root = ET.fromstring(xml_response.text)\n", "\n", "# Read XML from str...\n", "my_xml_root = ET.fromstring(xml_data) # to read from file: ET.parse('sample.xml').getroot()\n", "\n", "\n", "# Get all children...\n", "all_children = list(my_xml_root)\n", "\n", "# Check tag, attribute and text of first child...\n", "all_children[0][0].tag, all_children[0][0].attrib, all_children[0][0].text\n", "\n", "# Loop to see all tag, attribute and text of first child...\n", "for x in all_children[0]:\n", " print(x.tag, x.attrib, x.text)\n", " \n", "# Now we can collect the records into list/dictionary to construct a friend dataframe structure\n", "data_list = []\n", "for child in all_children:\n", " data_dict = {}\n", " \n", " data_dict['Author'] = child.find('author').text\n", " data_dict['Title'] = child.find('title').text\n", " data_dict['Genre'] = child.find('genre').text\n", " data_dict['Price'] = child.find('price').text\n", " data_dict['Publish Date'] = child.find('publish_date').text\n", " data_dict['Description'] = child.find('description').text\n", " \n", " \n", " data_list.append(data_dict)\n", "# =========================================\n", "\n", "# Generate the df...\n", "data_list_df = pd.DataFrame(data_list)\n", "data_list_df" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }